From 89ed6773e76e18dcfddefdff01ba9d09e66c39fb Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 6 Mar 2017 09:55:36 +0100 Subject: [PATCH 001/638] Add 'contrib/kube-prometheus/' from commit '81c0d2f4d30f63a4e274c2870c5afc89241827b0' git-subtree-dir: contrib/kube-prometheus git-subtree-mainline: 050ca21276696c8603375c699513ec487301ed62 git-subtree-split: 81c0d2f4d30f63a4e274c2870c5afc89241827b0 --- README.md | 138 + assets/grafana/all-nodes-dashboard.json | 860 +++++ assets/grafana/deployment-dashboard.json | 817 +++++ assets/grafana/kubernetes-pods-dashboard.json | 409 +++ assets/grafana/node-dashboard.json | 880 +++++ assets/grafana/prometheus-datasource.json | 7 + assets/prometheus/rules/etcd2.rules | 121 + assets/prometheus/rules/kubernetes.rules | 388 +++ docs/KOPSonAWS.md | 35 + hack/cluster-monitoring/deploy | 41 + hack/cluster-monitoring/minikube-deploy | 6 + hack/cluster-monitoring/minikube-teardown | 6 + hack/cluster-monitoring/self-hosted-deploy | 6 + hack/cluster-monitoring/self-hosted-teardown | 6 + hack/cluster-monitoring/teardown | 24 + hack/example-service-monitoring/deploy | 19 + hack/example-service-monitoring/teardown | 12 + hack/scripts/generate-configmaps.sh | 8 + hack/scripts/wrap-dashboard.sh | 50 + .../alertmanager/alertmanager-config.yaml | 18 + .../alertmanager/alertmanager-service.yaml | 14 + manifests/alertmanager/alertmanager.yaml | 9 + manifests/etcd/etcd-bootkube-gce.yaml | 28 + .../etcd/etcd-bootkube-vagrant-multi.yaml | 28 + .../examples/example-app/example-app.yaml | 34 + .../example-app/prometheus-frontend-svc.yaml | 14 + .../example-app/prometheus-frontend.yaml | 24 + .../example-app/servicemonitor-frontend.yaml | 13 + .../kube-state-metrics-deployment.yaml | 25 + .../exporters/kube-state-metrics-service.yaml | 18 + .../exporters/node-exporter-daemonset.yaml | 45 + .../exporters/node-exporter-service.yaml | 17 + manifests/grafana/grafana-dashboards.yaml | 2984 +++++++++++++++++ manifests/grafana/grafana-deployment.yaml | 
56 + manifests/grafana/grafana-service.yaml | 15 + .../k8s/minikube/kube-controller-manager.yaml | 28 + manifests/k8s/minikube/kube-scheduler.yaml | 28 + .../self-hosted/kube-controller-manager.yaml | 16 + manifests/k8s/self-hosted/kube-dns.yaml | 20 + manifests/k8s/self-hosted/kube-scheduler.yaml | 16 + manifests/prometheus-operator.yaml | 26 + .../prometheus/prometheus-k8s-rules.yaml | 447 +++ .../prometheus/prometheus-k8s-service.yaml | 14 + .../prometheus-k8s-servicemonitors.yaml | 69 + manifests/prometheus/prometheus-k8s.yaml | 24 + 45 files changed, 7863 insertions(+) create mode 100644 README.md create mode 100644 assets/grafana/all-nodes-dashboard.json create mode 100644 assets/grafana/deployment-dashboard.json create mode 100644 assets/grafana/kubernetes-pods-dashboard.json create mode 100644 assets/grafana/node-dashboard.json create mode 100644 assets/grafana/prometheus-datasource.json create mode 100644 assets/prometheus/rules/etcd2.rules create mode 100644 assets/prometheus/rules/kubernetes.rules create mode 100644 docs/KOPSonAWS.md create mode 100755 hack/cluster-monitoring/deploy create mode 100755 hack/cluster-monitoring/minikube-deploy create mode 100755 hack/cluster-monitoring/minikube-teardown create mode 100755 hack/cluster-monitoring/self-hosted-deploy create mode 100755 hack/cluster-monitoring/self-hosted-teardown create mode 100755 hack/cluster-monitoring/teardown create mode 100755 hack/example-service-monitoring/deploy create mode 100755 hack/example-service-monitoring/teardown create mode 100755 hack/scripts/generate-configmaps.sh create mode 100755 hack/scripts/wrap-dashboard.sh create mode 100644 manifests/alertmanager/alertmanager-config.yaml create mode 100644 manifests/alertmanager/alertmanager-service.yaml create mode 100644 manifests/alertmanager/alertmanager.yaml create mode 100644 manifests/etcd/etcd-bootkube-gce.yaml create mode 100644 manifests/etcd/etcd-bootkube-vagrant-multi.yaml create mode 100644 
manifests/examples/example-app/example-app.yaml create mode 100644 manifests/examples/example-app/prometheus-frontend-svc.yaml create mode 100644 manifests/examples/example-app/prometheus-frontend.yaml create mode 100644 manifests/examples/example-app/servicemonitor-frontend.yaml create mode 100644 manifests/exporters/kube-state-metrics-deployment.yaml create mode 100644 manifests/exporters/kube-state-metrics-service.yaml create mode 100644 manifests/exporters/node-exporter-daemonset.yaml create mode 100644 manifests/exporters/node-exporter-service.yaml create mode 100644 manifests/grafana/grafana-dashboards.yaml create mode 100644 manifests/grafana/grafana-deployment.yaml create mode 100644 manifests/grafana/grafana-service.yaml create mode 100644 manifests/k8s/minikube/kube-controller-manager.yaml create mode 100644 manifests/k8s/minikube/kube-scheduler.yaml create mode 100644 manifests/k8s/self-hosted/kube-controller-manager.yaml create mode 100644 manifests/k8s/self-hosted/kube-dns.yaml create mode 100644 manifests/k8s/self-hosted/kube-scheduler.yaml create mode 100644 manifests/prometheus-operator.yaml create mode 100644 manifests/prometheus/prometheus-k8s-rules.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service.yaml create mode 100644 manifests/prometheus/prometheus-k8s-servicemonitors.yaml create mode 100644 manifests/prometheus/prometheus-k8s.yaml diff --git a/README.md b/README.md new file mode 100644 index 00000000..db4f554d --- /dev/null +++ b/README.md @@ -0,0 +1,138 @@ +# kube-prometheus + +This repository collects Kubernetes manifests, dashboards, and alerting rules +combined with documentation and scripts to provide single-command deployments +of end-to-end Kubernetes cluster monitoring. + +## Prerequisites + +First, you need a running Kubernetes cluster. If you don't have one, follow the +instructions of [bootkube](https://github.com/kubernetes-incubator/bootkube) or +[minikube](https://github.com/kubernetes/minikube). 
Some sample contents of this +repository are adapted to work with a [multi-node setup](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node) +using [bootkube](https://github.com/kubernetes-incubator/bootkube). + +## Monitoring Kubernetes + +The manifests used here use the [Prometheus Operator](https://github.com/coreos/prometheus-operator), +which manages Prometheus servers and their configuration in a cluster. With a single command we can install + +* The Operator itself +* The Prometheus [node_exporter](https://github.com/prometheus/node_exporter) +* [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) +* The [Prometheus specification](https://github.com/coreos/prometheus-operator/blob/master/Documentation/prometheus.md) based on which the Operator deploys a Prometheus setup +* A Prometheus configuration covering monitoring of all Kubernetes core components and exporters +* A default set of alerting rules on the cluster components' health +* A Grafana instance serving dashboards on cluster metrics +* A three node highly available Alertmanager cluster + +Simply run: + +```bash +export KUBECONFIG= # defaults to "~/.kube/config" +hack/cluster-monitoring/deploy +``` + +After all pods are ready, you can reach: + +* Prometheus UI on node port `30900` +* Alertmanager UI on node port `30903` +* Grafana on node port `30902` + +To tear it all down again, run: + +```bash +hack/cluster-monitoring/teardown +``` + +## Monitoring custom services + +The example manifests in [/manifests/examples/example-app](/manifests/examples/example-app) +deploy a fake service exposing Prometheus metrics. They additionally define a new Prometheus +server and a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/master/Documentation/service-monitor.md), +which specifies how the example service should be monitored. +The Prometheus Operator will deploy and configure the desired Prometheus instance and continuously +manage its life cycle. 
+ +```bash +hack/example-service-monitoring/deploy +``` + +After all pods are ready you can reach the Prometheus server on node port `30100` and observe +how it monitors the service as specified. Same as before, this Prometheus server automatically +discovers the Alertmanager cluster deployed in the [Monitoring Kubernetes](#monitoring-kubernetes) +section. + +Teardown: + +```bash +hack/example-service-monitoring/teardown +``` + +## Dashboarding + +The provided manifests deploy a Grafana instance serving dashboards provided via a ConfigMap. +To modify, delete, or add dashboards, the `grafana-dashboards` ConfigMap must be modified. + +Currently, Grafana does not support serving dashboards from static files. Instead, the `grafana-watcher` +sidecar container aims to emulate the behavior by keeping the Grafana database always in sync +with the provided ConfigMap. Hence, the Grafana pod is effectively stateless. +This allows managing dashboards via `git` etc. and easily deploying them via CD pipelines. + +In the future, a separate Grafana operator will support gathering dashboards from multiple +ConfigMaps based on label selection. + +## Roadmap + +* Grafana Operator that dynamically discovers and deploys dashboards from ConfigMaps +* KPM/Helm packages to easily provide production-ready cluster-monitoring setup (essentially contents of `hack/cluster-monitoring`) +* Add meta-monitoring to default cluster monitoring setup +* Build out the provided dashboards and alerts for cluster monitoring to have full coverage of all system aspects + +## Monitoring other Cluster Components + +Discovery of API servers and kubelets works the same across all clusters. +Depending on a cluster's setup several other core components, such as etcd or the +scheduler, may be deployed in different ways. +The easiest integration point is for the cluster operator to provide headless services +of all those components to provide a common interface for discovering them. 
With that +setup they will automatically be discovered by the provided Prometheus configuration. + +For the `kube-scheduler` and `kube-controller-manager` there are headless +services prepared; simply add them to your running cluster: + +```bash +kubectl -n kube-system create manifests/k8s/ +``` + +> Hint: if you use this for a cluster not created with bootkube, make sure you +> populate an endpoints object with the address to your `kube-scheduler` and +> `kube-controller-manager`, or adapt the label selectors to match your setup. + +Aside from Kubernetes specific components, etcd is an important part of a +working cluster, but is typically deployed outside of it. This monitoring +setup assumes that it is made visible from within the cluster through a headless +service as well. + +> Note that minikube hides some components like etcd so to see the extent of +> this setup we recommend setting up a [local cluster using bootkube](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node). + +An example for bootkube's multi-node vagrant setup is [here](/manifests/etcd/etcd-bootkube-vagrant-multi.yaml). + +> Hint: this is merely an example for a local setup. The addresses will have to +> be adapted for a setup that is not a single etcd bootkube created cluster. 
+ +With that setup the headless services provide endpoint lists consumed by +Prometheus to discover the endpoints as targets: + +```bash +$ kubectl get endpoints --all-namespaces +NAMESPACE NAME ENDPOINTS AGE +default kubernetes 172.17.4.101:443 2h +kube-system kube-controller-manager-prometheus-discovery 10.2.30.2:10252 1h +kube-system kube-scheduler-prometheus-discovery 10.2.30.4:10251 1h +monitoring etcd-k8s 172.17.4.51:2379 1h +``` + +## Other Documentation +[Install Docs for a cluster created with KOPS on AWS](docs/KOPSonAWS.md) diff --git a/assets/grafana/all-nodes-dashboard.json b/assets/grafana/all-nodes-dashboard.json new file mode 100644 index 00000000..7a7c2bde --- /dev/null +++ b/assets/grafana/all-nodes-dashboard.json @@ -0,0 +1,860 @@ +{ + "dashboard": +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + 
"links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + 
"tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + 
"interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 4, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": 
"singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + 
"values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + 
"msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "All Nodes", + "version": 1 +}, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true +} diff --git a/assets/grafana/deployment-dashboard.json b/assets/grafana/deployment-dashboard.json new file mode 100644 index 00000000..69638d15 --- /dev/null +++ b/assets/grafana/deployment-dashboard.json @@ -0,0 +1,817 @@ +{ + "dashboard": { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + 
"id": null, + "title": "Deployment", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": true, + "rows": [ + { + "collapse": false, + "editable": true, + "height": "200px", + "panels": [ + { + "title": "CPU", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 8, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "cores", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "110%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "title": "Memory", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 9, + "targets": [ + { + "refId": "A", + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "step": 600 + } 
+ ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "GB", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "80%", + "valueFontSize": "110%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "title": "Network", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 7, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "Bps", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + 
"text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": false, + "thresholdLabels": false + } + } + ], + "title": "Row", + "showTitle": false + }, + { + "title": "New row", + "height": "100px", + "editable": true, + "collapse": false, + "panels": [ + { + "title": "Desired Replicas", + "error": false, + "span": 3, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 5, + "targets": [ + { + "refId": "A", + "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "step": 600, + "metric": "kube_deployment_spec_replicas" + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": false, + "full": false, + 
"lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": false, + "thresholdLabels": false + }, + "decimals": null + }, + { + "title": "Available Replicas", + "error": false, + "span": 3, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 6, + "targets": [ + { + "refId": "A", + "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": false, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + 
"thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 
118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ] + }, + { + "collapse": false, + "editable": true, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false, + "hideZero": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { 
+ "expr": "kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "transparent": false + } + ], + "title": "New row", + "showTitle": false + } + ], + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": 
"label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "annotations": { + "list": [] + }, + "schemaVersion": 12, + "version": 2, + "links": [], + "gnetId": null +}, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true +} \ No newline at end of file diff --git a/assets/grafana/kubernetes-pods-dashboard.json b/assets/grafana/kubernetes-pods-dashboard.json new file mode 100644 index 00000000..035da015 --- /dev/null +++ b/assets/grafana/kubernetes-pods-dashboard.json @@ -0,0 +1,409 @@ +{ + "dashboard": { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "3.1.1" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + 
"lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + }, + { + "expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_requested_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": 
"connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": 
null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + } + ], + "schemaVersion": 12, + "sharedCrosshair": true, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 26 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + 
"value": "prometheus" + } + ], + "overwrite": true +} diff --git a/assets/grafana/node-dashboard.json b/assets/grafana/node-dashboard.json new file mode 100644 index 00000000..78a5bb37 --- /dev/null +++ b/assets/grafana/node-dashboard.json @@ -0,0 +1,880 @@ +{ + "dashboard": +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + 
"title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 4 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 4 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + 
"showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + 
"colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + 
"msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 1 +}, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true +} diff --git a/assets/grafana/prometheus-datasource.json b/assets/grafana/prometheus-datasource.json new file mode 100644 index 00000000..47b8f1b2 --- /dev/null +++ b/assets/grafana/prometheus-datasource.json @@ -0,0 +1,7 @@ +{ + "access": "proxy", + "basicAuth": false, + "name": "prometheus", + "type": "prometheus", + "url": "http://prometheus-k8s.monitoring.svc:9090" +} diff --git a/assets/prometheus/rules/etcd2.rules 
b/assets/prometheus/rules/etcd2.rules new file mode 100644 index 00000000..4a38894e --- /dev/null +++ b/assets/prometheus/rules/etcd2.rules @@ -0,0 +1,121 @@ +### General cluster availability ### + +# alert if another failed peer will result in an unavailable cluster +ALERT InsufficientPeers + IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) + FOR 3m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Etcd cluster small", + description = "If one more etcd peer goes down the cluster will be unavailable", + } + +### HTTP requests alerts ### + +# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response +ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + +# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response +ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + +# alert if 50% of requests get a 4xx response +ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + 
summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", + } + +# alert if the 99th percentile of HTTP requests take more than 150ms +ALERT HTTPRequestsSlow + IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "slow HTTP requests", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", + } + +### File descriptor alerts ### + +instance:fd_utilization = process_open_fds / process_max_fds + +# alert if file descriptors are likely to exhaust within the next 4 hours +ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + } + +# alert if file descriptors are likely to exhaust within the next hour +ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[10m], 3600) > 1 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + } + +### etcd proposal alerts ### + +# alert if there are several failed proposals within an hour +ALERT HighNumberOfFailedProposals + IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of failed proposals within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + } + +### etcd disk io latency alerts ### + +# alert if 99th percentile of fsync
durations is higher than 500ms +ALERT HighFsyncDurations + IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high fsync durations", + description = "etcd instance {{ $labels.instance }} fsync durations are high", + } diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules new file mode 100644 index 00000000..157eb3fa --- /dev/null +++ b/assets/prometheus/rules/kubernetes.rules @@ -0,0 +1,388 @@ +# NOTE: These rules were kindly contributed by the SoundCloud engineering team. + +### Container resources ### + +cluster_namespace_controller_pod_container:spec_memory_limit_bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:spec_cpu_shares = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_cpu_shares{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:cpu_usage:rate = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + irate( + container_cpu_usage_seconds_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_working_set:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", + "pod_name",
"^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_rss:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_rss{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_cache:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_cache{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:disk_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_disk_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_pagefaults:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failures_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +cluster_namespace_controller_pod_container:memory_oom:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failcnt{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + +### Cluster resources ### + +cluster:memory_allocation:percent = + 100 * sum by (cluster) ( + container_spec_memory_limit_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + +cluster:memory_used:percent = + 100 * sum by (cluster) ( + container_memory_usage_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + +cluster:cpu_allocation:percent = + 100 * sum by (cluster) ( + container_spec_cpu_shares{pod_name!=""} + ) / sum by (cluster) ( + container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores + ) + +cluster:node_cpu_use:percent = + 100 * sum by 
(cluster) ( + rate(node_cpu{mode!="idle"}[5m]) + ) / sum by (cluster) ( + machine_cpu_cores + ) + +### API latency ### + +# Raw metrics are in microseconds. Convert to seconds. +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile( + 0.99, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile( + 0.9, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 +cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile( + 0.5, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + +### Scheduling latency ### + +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + 
+cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 +cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + +ALERT K8SNodeDown + IF up{job="kubelet"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + +ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + +ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 20% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + +ALERT K8SKubeletNodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet node_exporter cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than 15 minutes.", + } + +ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or
count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + +ALERT K8SApiserverDown + IF up{job="kubernetes"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "An API server could not be scraped.", + } + +# Disable for non HA kubernetes setups. +ALERT K8SApiserverDown + IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) + FOR 5m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + +ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + } + +ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", + } + +ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + +ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + +# To catch the conntrack sysctl de-tuning when it happens +ALERT K8SConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the correct tunings, investigate when it happens.", + } + +ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + +ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 + FOR 10m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + +# Some verbs excluded because they are expected to be long-lasting: +# WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
+ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + +ALERT K8SApiServerEtcdAccessLatency + IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Access to etcd is slow", + description = "99th percentile latency for apiserver to access etcd is higher than 1s.", + } + +ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } + diff --git a/docs/KOPSonAWS.md b/docs/KOPSonAWS.md new file mode 100644 index 00000000..902dab74 --- /dev/null +++ b/docs/KOPSonAWS.md @@ -0,0 +1,35 @@ +# Adding kube-prometheus to [KOPS](https://github.com/kubernetes/kops) on AWS 1.5.x + + +## Prerequisites + +A running Kubernetes cluster created with [KOPS](https://github.com/kubernetes/kops). + +These instructions have currently been tested with **topology=public** on AWS with KOPS 1.5.1 and Kubernetes 1.5.x + +## Open AWS Security Groups: +1. Open port 9100 on the masters security group to the nodes security group +1. Open ports 10250-10252 on the masters security group to the nodes security group. 
+ +Example script below requires $AWS\_DEFAULT_PROFILE and [$NAME](https://github.com/kubernetes/kops/blob/master/docs/aws.md#prepare-local-environment) + +```bash +MASTER_SG=$(aws --profile ${AWS_DEFAULT_PROFILE} ec2 describe-security-groups --filters "Name=tag:Name,Values=masters.$NAME" --query "SecurityGroups[*].GroupId[]" --output=text) +NODES_SG=$(aws --profile ${AWS_DEFAULT_PROFILE} ec2 describe-security-groups --filters "Name=tag:Name,Values=nodes.$NAME" --query "SecurityGroups[*].GroupId[]" --output=text) +aws --profile ${AWS_DEFAULT_PROFILE} ec2 authorize-security-group-ingress --group-id $MASTER_SG --protocol tcp --port 9100 --source-group $NODES_SG +aws --profile ${AWS_DEFAULT_PROFILE} ec2 authorize-security-group-ingress --group-id $MASTER_SG --protocol tcp --port 10250-10252 --source-group $NODES_SG +``` + +## Adding kube-prometheus +Following the instructions in the [README](https://github.com/coreos/kube-prometheus/blob/master/README.md): + +Example: + +```bash +git clone -b master https://github.com/coreos/kube-prometheus.git kube-prometheus-temp; +cd kube-prometheus-temp +./hack/cluster-monitoring/deploy +kubectl -n kube-system create -f manifests/k8s/self-hosted/ +cd - +rm -rf kube-prometheus-temp +``` diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy new file mode 100755 index 00000000..9ad91eb0 --- /dev/null +++ b/hack/cluster-monitoring/deploy @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + export KUBECONFIG=~/.kube/config +fi + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=monitoring +fi + +kubectl create namespace "$NAMESPACE" + +kctl() { + kubectl --namespace "$NAMESPACE" "$@" +} + +kctl apply -f manifests/prometheus-operator.yaml + +# Wait for TPRs to be ready. +printf "Waiting for Operator to register third party objects..." 
+until kctl get servicemonitor > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get prometheus > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get alertmanager > /dev/null 2>&1; do sleep 1; printf "."; done +echo "done!" + +kctl apply -f manifests/exporters +kctl apply -f manifests/grafana + +kctl apply -f manifests/prometheus/prometheus-k8s-rules.yaml +kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml + +kctl apply -f manifests/alertmanager/alertmanager-config.yaml +kctl apply -f manifests/alertmanager/alertmanager-service.yaml + +# `kubectl apply` is currently not working for third party resources so we are +# using `kubectl create` here for the time being. +# (https://github.com/kubernetes/kubernetes/issues/29542) +kctl create -f manifests/prometheus/prometheus-k8s-servicemonitors.yaml +kctl create -f manifests/prometheus/prometheus-k8s.yaml +kctl create -f manifests/alertmanager/alertmanager.yaml + diff --git a/hack/cluster-monitoring/minikube-deploy b/hack/cluster-monitoring/minikube-deploy new file mode 100755 index 00000000..ab7e72e4 --- /dev/null +++ b/hack/cluster-monitoring/minikube-deploy @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +hack/cluster-monitoring/deploy + +awk 'FNR==1{print "---"}1' manifests/k8s/minikube/*.yaml | sed s/MINIKUBE_IP/`minikube ip`/g | kubectl --namespace=kube-system apply -f - + diff --git a/hack/cluster-monitoring/minikube-teardown b/hack/cluster-monitoring/minikube-teardown new file mode 100755 index 00000000..3a4c986e --- /dev/null +++ b/hack/cluster-monitoring/minikube-teardown @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +hack/cluster-monitoring/teardown + +kubectl --namespace=kube-system delete -f manifests/k8s/minikube + diff --git a/hack/cluster-monitoring/self-hosted-deploy b/hack/cluster-monitoring/self-hosted-deploy new file mode 100755 index 00000000..a25f7ed3 --- /dev/null +++ b/hack/cluster-monitoring/self-hosted-deploy @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +hack/cluster-monitoring/deploy + 
+kubectl --namespace=kube-system apply -f manifests/k8s/self-hosted + diff --git a/hack/cluster-monitoring/self-hosted-teardown b/hack/cluster-monitoring/self-hosted-teardown new file mode 100755 index 00000000..05fd625a --- /dev/null +++ b/hack/cluster-monitoring/self-hosted-teardown @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +hack/cluster-monitoring/teardown + +kubectl --namespace=kube-system delete -f manifests/k8s/self-hosted + diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown new file mode 100755 index 00000000..45ae61ed --- /dev/null +++ b/hack/cluster-monitoring/teardown @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + export KUBECONFIG=~/.kube/config +fi + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=monitoring +fi + +kctl() { + kubectl --namespace "$NAMESPACE" "$@" +} + +kctl delete -f manifests/exporters +kctl delete -f manifests/grafana +kctl delete -f manifests/prometheus +kctl delete -f manifests/alertmanager + +# Hack: wait a bit to let the controller delete the deployed Prometheus server. +sleep 5 + +kctl delete -f manifests/prometheus-operator.yaml + diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy new file mode 100755 index 00000000..420b5940 --- /dev/null +++ b/hack/example-service-monitoring/deploy @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + KUBECONFIG=~/.kube/config +fi + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=default +fi + +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-svc.yaml +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/example-app.yaml + +# `kubectl apply` is currently not working for third party resources so we are +# using `kubectl create` here for the time being. 
+# (https://github.com/kubernetes/kubernetes/issues/29542) +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" create -f manifests/examples/example-app/prometheus-frontend.yaml +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" create -f manifests/examples/example-app/servicemonitor-frontend.yaml + diff --git a/hack/example-service-monitoring/teardown b/hack/example-service-monitoring/teardown new file mode 100755 index 00000000..a631fe3e --- /dev/null +++ b/hack/example-service-monitoring/teardown @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + KUBECONFIG=~/.kube/config +fi + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=default +fi + +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" delete -f manifests/examples/example-app + diff --git a/hack/scripts/generate-configmaps.sh b/hack/scripts/generate-configmaps.sh new file mode 100755 index 00000000..02ba18e9 --- /dev/null +++ b/hack/scripts/generate-configmaps.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Generate Alert Rules ConfigMap +kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml + +# Generate Dashboard ConfigMap +kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml + diff --git a/hack/scripts/wrap-dashboard.sh b/hack/scripts/wrap-dashboard.sh new file mode 100755 index 00000000..1b514387 --- /dev/null +++ b/hack/scripts/wrap-dashboard.sh @@ -0,0 +1,50 @@ +#!/bin/bash -eu + +# Intended usage: +# * Edit dashboard in Grafana (you need to log in first with admin/admin +# login/password). +# * Save dashboard in Grafana to check that the specification is correct. +# Looks like this is the only way to check whether the dashboard specification +# has errors. +# * Download dashboard specification as JSON file in Grafana: +# Share -> Export -> Save to file.
+# * Wrap dashboard specification to make it digestible by kube-prometheus: +# ./hack/scripts/wrap-dashboard.sh Nodes-1488465802729.json +# * Replace dashboard specification: +# mv Nodes-1488465802729.json assets/grafana/node-dashboard.json +# * Regenerate Grafana configmap: +# ./hack/scripts/generate-configmaps.sh +# * Apply new configmap: +# kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 path-to-dashboard.json" + exit 1 +fi + +json=$1 +temp=$(tempfile -m 0644) + +cat >> $temp <> $temp + +cat >> $temp <2Gi + # memory. Modify based on your target and time-series count for + # production use. This value is mainly meant for demonstration/testing + # purposes. + memory: 400Mi + alerting: + alertmanagers: + - namespace: monitoring + name: alertmanager-main + port: web diff --git a/manifests/examples/example-app/servicemonitor-frontend.yaml b/manifests/examples/example-app/servicemonitor-frontend.yaml new file mode 100644 index 00000000..4ceaacd6 --- /dev/null +++ b/manifests/examples/example-app/servicemonitor-frontend.yaml @@ -0,0 +1,13 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: frontend + labels: + tier: frontend +spec: + selector: + matchLabels: + tier: frontend + endpoints: + - port: web + interval: 10s \ No newline at end of file diff --git a/manifests/exporters/kube-state-metrics-deployment.yaml b/manifests/exporters/kube-state-metrics-deployment.yaml new file mode 100644 index 00000000..6ef971ce --- /dev/null +++ b/manifests/exporters/kube-state-metrics-deployment.yaml @@ -0,0 +1,25 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: kube-state-metrics +spec: + replicas: 1 + template: + metadata: + labels: + app: kube-state-metrics + spec: + containers: + - name: kube-state-metrics + image: gcr.io/google_containers/kube-state-metrics:v0.4.1 + ports: + - name: metrics + containerPort: 8080 + resources: + requests: + memory: 30Mi + cpu:
100m + limits: + memory: 50Mi + cpu: 200m + diff --git a/manifests/exporters/kube-state-metrics-service.yaml b/manifests/exporters/kube-state-metrics-service.yaml new file mode 100644 index 00000000..607869e1 --- /dev/null +++ b/manifests/exporters/kube-state-metrics-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: kube-state-metrics + k8s-app: kube-state-metrics + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" + name: kube-state-metrics +spec: + ports: + - name: http-metrics + port: 8080 + targetPort: metrics + protocol: TCP + selector: + app: kube-state-metrics + diff --git a/manifests/exporters/node-exporter-daemonset.yaml b/manifests/exporters/node-exporter-daemonset.yaml new file mode 100644 index 00000000..8c9565ba --- /dev/null +++ b/manifests/exporters/node-exporter-daemonset.yaml @@ -0,0 +1,45 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: node-exporter +spec: + template: + metadata: + labels: + app: node-exporter + name: node-exporter + spec: + hostNetwork: true + hostPID: true + containers: + - image: quay.io/prometheus/node-exporter:v0.13.0 + args: + - "-collector.procfs=/host/proc" + - "-collector.sysfs=/host/sys" + name: node-exporter + ports: + - containerPort: 9100 + hostPort: 9100 + name: scrape + resources: + requests: + memory: 30Mi + cpu: 100m + limits: + memory: 50Mi + cpu: 200m + volumeMounts: + - name: proc + readOnly: true + mountPath: /host/proc + - name: sys + readOnly: true + mountPath: /host/sys + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + diff --git a/manifests/exporters/node-exporter-service.yaml b/manifests/exporters/node-exporter-service.yaml new file mode 100644 index 00000000..46b1a3fd --- /dev/null +++ b/manifests/exporters/node-exporter-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: node-exporter + k8s-app: node-exporter + name: node-exporter +spec: + type: 
ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 9100 + protocol: TCP + selector: + app: node-exporter + diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml new file mode 100644 index 00000000..f9757dc1 --- /dev/null +++ b/manifests/grafana/grafana-dashboards.yaml @@ -0,0 +1,2984 @@ +apiVersion: v1 +data: + all-nodes-dashboard.json: | + { + "dashboard": + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + 
"refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", 
+ "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 4, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + 
"xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": 
{}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, 
+ "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "All Nodes", + "version": 1 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + deployment-dashboard.json: |- + { + "dashboard": { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "id": null, + "title": "Deployment", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": true, + "rows": [ + { + "collapse": false, + "editable": true, + "height": "200px", + "panels": [ + { + "title": "CPU", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 8, + "targets": [ + { + "refId": "A", + "expr": 
"sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "cores", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "110%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "title": "Memory", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 9, + "targets": [ + { + "refId": "A", + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "GB", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + 
"from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "80%", + "valueFontSize": "110%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "title": "Network", + "error": false, + "span": 4, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 7, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "Bps", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": true, + "full": false, + "lineColor": 
"rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": false, + "thresholdLabels": false + } + } + ], + "title": "Row", + "showTitle": false + }, + { + "title": "New row", + "height": "100px", + "editable": true, + "collapse": false, + "panels": [ + { + "title": "Desired Replicas", + "error": false, + "span": 3, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 5, + "targets": [ + { + "refId": "A", + "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "step": 600, + "metric": "kube_deployment_spec_replicas" + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": false, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": false, + "thresholdLabels": false + }, + "decimals": null + }, + { + "title": "Available Replicas", + "error": false, + "span": 3, + "editable": true, + "type": "singlestat", + "isNew": true, + "id": 6, + "targets": [ + { + "refId": "A", + "expr": 
"kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "step": 600 + } + ], + "links": [], + "datasource": "${DS_PROMETHEUS}", + "maxDataPoints": 100, + "interval": null, + "cacheTimeout": null, + "format": "none", + "prefix": "", + "postfix": "", + "nullText": null, + "valueMaps": [ + { + "value": "null", + "op": "=", + "text": "N/A" + } + ], + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "rangeMaps": [ + { + "from": "null", + "to": "null", + "text": "N/A" + } + ], + "mappingType": 1, + "nullPointMode": "connected", + "valueName": "avg", + "prefixFontSize": "50%", + "valueFontSize": "80%", + "postfixFontSize": "50%", + "thresholds": "", + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "sparkline": { + "show": false, + "full": false, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 118, 189, 0.18)" + }, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", 
+ "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + 
"valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ] + }, + { + "collapse": false, + "editable": true, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false, + "hideZero": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { + "expr": "kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + 
"thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "transparent": false + } + ], + "title": "New row", + "showTitle": false + } + ], + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "annotations": { + "list": [] + }, + "schemaVersion": 12, + "version": 2, + "links": [], + "gnetId": null + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": 
"datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + kubernetes-pods-dashboard.json: | + { + "dashboard": { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "3.1.1" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + }, + { + "expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", 
container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_requested_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": 
true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + } + ], + "schemaVersion": 12, + "sharedCrosshair": true, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": 
"${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 26 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + node-dashboard.json: | + { + "dashboard": + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + 
"version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": 
[], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 4 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 4 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + 
"interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, 
+ "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + 
"fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": 
[ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": 
false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 1 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + prometheus-datasource.json: | + { + "access": "proxy", + "basicAuth": false, + "name": "prometheus", + "type": "prometheus", + "url": "http://prometheus-k8s.monitoring.svc:9090" + } +kind: ConfigMap +metadata: + creationTimestamp: null + name: grafana-dashboards diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml new file mode 100644 index 00000000..5a70df49 --- /dev/null +++ b/manifests/grafana/grafana-deployment.yaml @@ -0,0 +1,56 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: grafana +spec: + replicas: 1 + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:4.1.1 + env: + - name: GF_AUTH_BASIC_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + volumeMounts: + - name: grafana-storage + mountPath: /var/grafana-storage + ports: + - name: web + containerPort: 3000 + resources: + requests: + memory: 100Mi + cpu: 100m + limits: + memory: 300Mi + cpu: 300m + - name: grafana-watcher + image: quay.io/coreos/grafana-watcher:latest + args: + - '--watch-dir=/var/grafana-dashboards' + - '--grafana-url=http://admin:admin@localhost:3000' + volumeMounts: + - name: 
grafana-dashboards + mountPath: /var/grafana-dashboards + resources: + requests: + memory: "16Mi" + cpu: "50m" + limits: + memory: "32Mi" + cpu: "100m" + volumeMounts: + - name: grafana-dashboards + mountPath: /var/grafana-dashboards + volumes: + - name: grafana-storage + emptyDir: {} + - name: grafana-dashboards + configMap: + name: grafana-dashboards diff --git a/manifests/grafana/grafana-service.yaml b/manifests/grafana/grafana-service.yaml new file mode 100644 index 00000000..adb26233 --- /dev/null +++ b/manifests/grafana/grafana-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana + labels: + app: grafana +spec: + type: NodePort + ports: + - name: web + port: 3000 + protocol: TCP + nodePort: 30902 + selector: + app: grafana diff --git a/manifests/k8s/minikube/kube-controller-manager.yaml b/manifests/k8s/minikube/kube-controller-manager.yaml new file mode 100644 index 00000000..135dd24c --- /dev/null +++ b/manifests/k8s/minikube/kube-controller-manager.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-controller-manager-prometheus-discovery + labels: + k8s-app: kube-controller-manager +spec: + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10252 + targetPort: 10252 + protocol: TCP +--- +apiVersion: v1 +kind: Endpoints +metadata: + name: kube-controller-manager-prometheus-discovery + labels: + k8s-app: kube-controller-manager +subsets: +- addresses: + - ip: MINIKUBE_IP + ports: + - name: http-metrics + port: 10252 + protocol: TCP diff --git a/manifests/k8s/minikube/kube-scheduler.yaml b/manifests/k8s/minikube/kube-scheduler.yaml new file mode 100644 index 00000000..b3b51f38 --- /dev/null +++ b/manifests/k8s/minikube/kube-scheduler.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-scheduler-prometheus-discovery + labels: + k8s-app: kube-scheduler +spec: + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10251 + targetPort: 
10251 + protocol: TCP +--- +apiVersion: v1 +kind: Endpoints +metadata: + name: kube-scheduler-prometheus-discovery + labels: + k8s-app: kube-scheduler +subsets: +- addresses: + - ip: MINIKUBE_IP + ports: + - name: http-metrics + port: 10251 + protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-controller-manager.yaml b/manifests/k8s/self-hosted/kube-controller-manager.yaml new file mode 100644 index 00000000..2f22a6f2 --- /dev/null +++ b/manifests/k8s/self-hosted/kube-controller-manager.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-controller-manager-prometheus-discovery + labels: + k8s-app: kube-controller-manager +spec: + selector: + k8s-app: kube-controller-manager + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10252 + targetPort: 10252 + protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-dns.yaml b/manifests/k8s/self-hosted/kube-dns.yaml new file mode 100644 index 00000000..36d9a0ad --- /dev/null +++ b/manifests/k8s/self-hosted/kube-dns.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-dns-prometheus-discovery + labels: + k8s-app: kube-dns +spec: + selector: + k8s-app: kube-dns + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics-skydns + port: 10055 + targetPort: 10055 + protocol: TCP + - name: http-metrics-dnsmasq + port: 10054 + targetPort: 10054 + protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-scheduler.yaml b/manifests/k8s/self-hosted/kube-scheduler.yaml new file mode 100644 index 00000000..331998fe --- /dev/null +++ b/manifests/k8s/self-hosted/kube-scheduler.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-scheduler-prometheus-discovery + labels: + k8s-app: kube-scheduler +spec: + selector: + k8s-app: kube-scheduler + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10251 + targetPort: 10251 + protocol: TCP diff --git a/manifests/prometheus-operator.yaml 
b/manifests/prometheus-operator.yaml new file mode 100644 index 00000000..6c8030eb --- /dev/null +++ b/manifests/prometheus-operator.yaml @@ -0,0 +1,26 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: prometheus-operator + labels: + operator: prometheus +spec: + replicas: 1 + template: + metadata: + labels: + operator: prometheus + spec: + containers: + - name: prometheus-operator + image: quay.io/coreos/prometheus-operator:v0.6.0 + args: + - "--kubelet-object=kube-system/kubelet" + - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" + resources: + requests: + cpu: 100m + memory: 50Mi + limits: + cpu: 200m + memory: 300Mi diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml new file mode 100644 index 00000000..08f6dddc --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -0,0 +1,447 @@ +apiVersion: v1 +data: + etcd2.rules: "### General cluster availability ###\n\n# alert if another failed + peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"} + == 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity + = \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n + \ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n + \ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to + an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n + \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n + \ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) > + 0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n + \ summary = \"a high number of HTTP requests are failing\",\n description + = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance + {{ $labels.instance }}\",\n }\n\n# alert if more than 5% 
of requests to an HTTP + endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n + \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m])) + \n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) + > 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS + {\n summary = \"a high number of HTTP requests are failing\",\n description + = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance + {{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT + HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", + code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) + > 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS + {\n summary = \"a high number of HTTP requests are failing\",\n description + = \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses + on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile + of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99, + rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS + {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP + requests\",\n description = \"on etcd instance {{ $labels.instance }} HTTP + requests to {{ $labels.method }} are slow\",\n }\n\n### File descriptor alerts + ###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert + if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n + \ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS + {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors + soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance + }} will exhaust its file descriptors 
soon\",\n }\n\n# alert if file descriptors + are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m], + 3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS + {\n summary = \"file descriptors soon exhausted\",\n description = \"{{ + $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors + soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed + proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h]) + > 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary + = \"a high number of failed proposals within the etcd cluster are happening\",\n + \ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }} + proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts + ###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT + HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) + > 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n + \ summary = \"high fsync durations\",\n description = \"etcd instance {{ + $labels.instance }} fsync durations are high\",\n }\n" + kubernetes.rules: |+ + # NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
+ + ### Container resources ### + + cluster_namespace_controller_pod_container:spec_memory_limit_bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:spec_cpu_shares = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_spec_cpu_shares{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:cpu_usage:rate = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + irate( + container_cpu_usage_seconds_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_usage:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_working_set:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_rss:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_rss{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_cache:bytes = + sum by (cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_memory_cache{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:disk_usage:bytes = + sum by 
(cluster,namespace,controller,pod_name,container_name) ( + label_replace( + container_disk_usage_bytes{container_name!=""}, + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_pagefaults:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failures_total{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + cluster_namespace_controller_pod_container:memory_oom:rate = + sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( + label_replace( + irate( + container_memory_failcnt{container_name!=""}[5m] + ), + "controller", "$1", + "pod_name", "^(.*)-[a-z0-9]+" + ) + ) + + ### Cluster resources ### + + cluster:memory_allocation:percent = + 100 * sum by (cluster) ( + container_spec_memory_limit_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + + cluster:memory_used:percent = + 100 * sum by (cluster) ( + container_memory_usage_bytes{pod_name!=""} + ) / sum by (cluster) ( + machine_memory_bytes + ) + + cluster:cpu_allocation:percent = + 100 * sum by (cluster) ( + container_spec_cpu_shares{pod_name!=""} + ) / sum by (cluster) ( + container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores + ) + + cluster:node_cpu_use:percent = + 100 * sum by (cluster) ( + rate(node_cpu{mode!="idle"}[5m]) + ) / sum by (cluster) ( + machine_cpu_cores + ) + + ### API latency ### + + # Raw metrics are in microseconds. Convert to seconds. 
+ cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile( + 0.99, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile( + 0.9, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile( + 0.5, + sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) + ) / 1e6 + + ### Scheduling latency ### + + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 + + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 + + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = + histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + 
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = + histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = + histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 + + ALERT K8SNodeDown + IF up{job="kubelet"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + + ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + + ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 20% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + + ALERT K8SKubeletNodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet node_exporter cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than 15 minutes.", + } + + ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS 
{ + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + + ALERT K8SApiserverDown + IF up{job="kubernetes"} == 0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "An API server could not be scraped.", + } + + # Disable for non HA kubernetes setups. + ALERT K8SApiserverDown + IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) + FOR 5m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + + ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + } + + ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + service = "k8s", + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", + } + + ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + + ALERT K8SConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + + # To catch the conntrack sysctl de-tuning when it happens + ALERT K8SConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the correct tunings, investigate when it happens.", + } + + ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + + ALERT K8STooManyOpenFiles + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 + FOR 10m + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "{{ $labels.job }} has too many open file descriptors", + description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + } + + # Some verbs excluded because they are expected to be long-lasting: + # WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
+ ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + + ALERT K8SApiServerEtcdAccessLatency + IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 + FOR 15m + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Access to etcd is slow", + description = "99th percentile latency for apiserver to access etcd is higher than 1s.", + } + + ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + service = "k8s", + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } + +kind: ConfigMap +metadata: + creationTimestamp: null + name: prometheus-k8s-rules diff --git a/manifests/prometheus/prometheus-k8s-service.yaml b/manifests/prometheus/prometheus-k8s-service.yaml new file mode 100644 index 00000000..a558f30f --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-k8s +spec: + type: NodePort + ports: + - name: web + nodePort: 30900 + port: 9090 + protocol: TCP + targetPort: web + selector: + prometheus: k8s diff --git a/manifests/prometheus/prometheus-k8s-servicemonitors.yaml b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml new file mode 100644 index 00000000..110dfa42 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml @@ -0,0 +1,69 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-apiserver + labels: + k8s-apps: https +spec: + 
jobLabel: provider + selector: + matchLabels: + component: apiserver + provider: kubernetes + namespaceSelector: + matchNames: + - default + endpoints: + - port: https + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-https + labels: + k8s-apps: https +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + namespaceSelector: + matchNames: + - kube-system + endpoints: + - port: https-metrics + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-http + labels: + k8s-apps: http +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + namespaceSelector: + matchNames: + - kube-system + - monitoring + endpoints: + - port: http-metrics + interval: 15s + - port: http-metrics-dnsmasq + interval: 15s + - port: http-metrics-skydns + interval: 15s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml new file mode 100644 index 00000000..9054ea58 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -0,0 +1,24 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: Prometheus +metadata: + name: k8s + labels: + prometheus: k8s +spec: + replicas: 2 + version: v1.5.2 + serviceMonitorSelector: + matchExpressions: + - {key: k8s-apps, operator: Exists} + resources: + requests: + # 2Gi is default, but won't schedule if you don't have a node with >2Gi + # memory. Modify based on your target and time-series count for + # production use. 
This value is mainly meant for demonstration/testing + # purposes. + memory: 400Mi + alerting: + alertmanagers: + - namespace: monitoring + name: alertmanager-main + port: web From 676bfa16596b161fc10be995d99db23b0f9960ed Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 7 Mar 2017 13:40:23 +0100 Subject: [PATCH 002/638] fix kube-prometheus references --- docs/KOPSonAWS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/KOPSonAWS.md b/docs/KOPSonAWS.md index 902dab74..0269f161 100644 --- a/docs/KOPSonAWS.md +++ b/docs/KOPSonAWS.md @@ -21,15 +21,15 @@ aws --profile ${AWS_DEFAULT_PROFILE} ec2 authorize-security-group-ingress --grou ``` ## Adding kube-prometheus -Following the instructions in the [README](https://github.com/coreos/kube-prometheus/blob/master/README.md): +Following the instructions in the [README](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/README.md): Example: ```bash -git clone -b master https://github.com/coreos/kube-prometheus.git kube-prometheus-temp; -cd kube-prometheus-temp +git clone -b master https://github.com/coreos/prometheus-operator.git prometheus-operator-temp; +cd prometheus-operator-temp/contrib/kube-prometheus ./hack/cluster-monitoring/deploy kubectl -n kube-system create -f manifests/k8s/self-hosted/ cd - -rm -rf kube-prometheus-temp +rm -rf prometheus-operator-temp ``` From cf5f99793bd1e4a5a4556cff543e9d7082378b1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=A1ndor=20Istv=C3=A1n=20Kr=C3=A1cser?= Date: Wed, 8 Mar 2017 09:49:35 +0100 Subject: [PATCH 003/638] Fix k8s resource example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index db4f554d..f441329b 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ For the `kube-scheduler` and `kube-controller-manager` there are headless services prepared, simply add them to your running cluster: ```bash -kubectl -n kube-system create manifests/k8s/ 
+kubectl -n kube-system create -f manifests/k8s/ ``` > Hint: if you use this for a cluster not created with bootkube, make sure you From e69a6f69ecd133b1333181fb4b262f24fe7cc450 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 9 Mar 2017 09:16:07 +0100 Subject: [PATCH 004/638] alertmanager: use a secret for the config --- assets/alertmanager/alertmanager.yaml | 12 +++++++++++ hack/scripts/generate-configmaps.sh | 3 +++ .../alertmanager/alertmanager-config.yaml | 21 +++++-------------- 3 files changed, 20 insertions(+), 16 deletions(-) create mode 100644 assets/alertmanager/alertmanager.yaml diff --git a/assets/alertmanager/alertmanager.yaml b/assets/alertmanager/alertmanager.yaml new file mode 100644 index 00000000..f08a2106 --- /dev/null +++ b/assets/alertmanager/alertmanager.yaml @@ -0,0 +1,12 @@ +global: + resolve_timeout: 5m +route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'webhook' +receivers: +- name: 'webhook' + webhook_configs: + - url: 'http://alertmanagerwh:30500/' diff --git a/hack/scripts/generate-configmaps.sh b/hack/scripts/generate-configmaps.sh index 02ba18e9..d031b3a4 100755 --- a/hack/scripts/generate-configmaps.sh +++ b/hack/scripts/generate-configmaps.sh @@ -6,3 +6,6 @@ kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/ # Generate Dashboard ConfigMap kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml +# Generate Secret for Alertmanager config +kubectl create secret generic alertmanager-main --dry-run --from-file=assets/alertmanager/alertmanager.yaml -oyaml > manifests/alertmanager/alertmanager-config.yaml + diff --git a/manifests/alertmanager/alertmanager-config.yaml b/manifests/alertmanager/alertmanager-config.yaml index 8f7fce5b..49f8c3c4 100644 --- a/manifests/alertmanager/alertmanager-config.yaml +++ b/manifests/alertmanager/alertmanager-config.yaml @@ -1,18 +1,7 
@@ apiVersion: v1 -kind: ConfigMap -metadata: - name: alertmanager-main data: - alertmanager.yaml: |- - global: - resolve_timeout: 5m - route: - group_by: ['job'] - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - receiver: 'webhook' - receivers: - - name: 'webhook' - webhook_configs: - - url: 'http://alertmanagerwh:30500/' + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg== +kind: Secret +metadata: + creationTimestamp: null + name: alertmanager-main From b85b5b6bcf3a17e4d850b43d6aae4aee54f2416c Mon Sep 17 00:00:00 2001 From: Mike Bryant Date: Thu, 9 Mar 2017 21:23:55 +0000 Subject: [PATCH 005/638] Account for multiple copies of kube-state-metrics This can happen if you run multiple replicas, or if you redeploy kube-state-metrics. In either case, prometheus records multiple metrics with the instance ip in, and the dashboard fails. 
Use aggregation functions to get sensible output in either case --- assets/grafana/deployment-dashboard.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/assets/grafana/deployment-dashboard.json b/assets/grafana/deployment-dashboard.json index 69638d15..357bd8e6 100644 --- a/assets/grafana/deployment-dashboard.json +++ b/assets/grafana/deployment-dashboard.json @@ -302,7 +302,7 @@ "targets": [ { "refId": "A", - "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "step": 600, "metric": "kube_deployment_spec_replicas" @@ -381,7 +381,7 @@ "targets": [ { "refId": "A", - "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "step": 600 } @@ -505,7 +505,7 @@ }, "targets": [ { - "expr": "kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -583,7 +583,7 @@ }, "targets": [ { - "expr": "kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -649,35 +649,35 @@ "steppedLine": false, "targets": [ { - "expr": 
"kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "current replicas", "refId": "A", "step": 30 }, { - "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "available", "refId": "B", "step": 30 }, { - "expr": "kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "unavailable", "refId": "C", "step": 30 }, { - "expr": "kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "updated", "refId": "D", "step": 30 }, { - "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "desired", "refId": "E", From 3a5b762cef5578a58077e2db691d219c56d6fa5a Mon Sep 17 00:00:00 2001 From: Mike Bryant Date: Thu, 9 Mar 2017 21:25:27 +0000 Subject: [PATCH 006/638] Update generated files --- manifests/grafana/grafana-dashboards.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git 
a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index f9757dc1..b9cfac08 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -1166,7 +1166,7 @@ data: "targets": [ { "refId": "A", - "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "step": 600, "metric": "kube_deployment_spec_replicas" @@ -1245,7 +1245,7 @@ data: "targets": [ { "refId": "A", - "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "step": 600 } @@ -1369,7 +1369,7 @@ data: }, "targets": [ { - "expr": "kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -1447,7 +1447,7 @@ data: }, "targets": [ { - "expr": "kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -1513,35 +1513,35 @@ data: "steppedLine": false, "targets": [ { - "expr": "kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) 
without (instance)", "intervalFactor": 2, "legendFormat": "current replicas", "refId": "A", "step": 30 }, { - "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "available", "refId": "B", "step": 30 }, { - "expr": "kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "unavailable", "refId": "C", "step": 30 }, { - "expr": "kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "updated", "refId": "D", "step": 30 }, { - "expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}", + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", "intervalFactor": 2, "legendFormat": "desired", "refId": "E", From 6a52b78990df8007750173f485e7dde8df3db999 Mon Sep 17 00:00:00 2001 From: Mike Bryant Date: Fri, 10 Mar 2017 10:02:05 +0000 Subject: [PATCH 007/638] Make kube-state-metrics HA by default, now the dashboards support it --- manifests/exporters/kube-state-metrics-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/exporters/kube-state-metrics-deployment.yaml b/manifests/exporters/kube-state-metrics-deployment.yaml index 6ef971ce..3fec8cad 100644 --- 
a/manifests/exporters/kube-state-metrics-deployment.yaml +++ b/manifests/exporters/kube-state-metrics-deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: name: kube-state-metrics spec: - replicas: 1 + replicas: 2 template: metadata: labels: From 51778eb36e9a461e8d7f572b66c4d0ff7fb6fd93 Mon Sep 17 00:00:00 2001 From: Mike Bryant Date: Fri, 10 Mar 2017 20:04:16 +0000 Subject: [PATCH 008/638] kube-prometheus: add resource requests dashboard This presents the resource requests vs the allocatable capacity in the cluster. --- .../grafana/resource-requests-dashboard.json | 424 ++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 assets/grafana/resource-requests-dashboard.json diff --git a/assets/grafana/resource-requests-dashboard.json b/assets/grafana/resource-requests-dashboard.json new file mode 100644 index 00000000..10732b85 --- /dev/null +++ b/assets/grafana/resource-requests-dashboard.json @@ -0,0 +1,424 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to show the resource requests vs allocatable in the cluster", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [CPU resource 
requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + "refId": "A", + "step": 10 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Cores", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU Cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + 
}, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "CPU Cores", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": 
"flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A", + "step": 10 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": 
"max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Resource Requests", + "version": 1 +} \ No newline at end of file From 3ac57cc9aa9770c448650845b201b494be935911 Mon Sep 17 00:00:00 2001 From: Mike Bryant Date: Fri, 10 Mar 2017 20:06:56 +0000 Subject: [PATCH 009/638] Update generated files --- manifests/grafana/grafana-dashboards.yaml | 425 ++++++++++++++++++++++ 1 file changed, 425 insertions(+) diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index b9cfac08..cba92d49 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -2978,6 +2978,431 @@ data: "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090" } + resource-requests-dashboard.json: |- + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": 
"4.1.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to show the resource requests vs allocatable in the cluster", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + "refId": "A", + "step": 10 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Cores", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + 
}, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU Cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "CPU Cores", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": 
{}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A", + "step": 10 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + 
"gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Resource Requests", + "version": 1 + } kind: ConfigMap metadata: creationTimestamp: null From 52dbfc7594938d723b8ea0e8af24f2f14f47506d Mon Sep 17 00:00:00 2001 From: Mike Bryant Date: Fri, 10 Mar 2017 21:34:47 +0000 Subject: [PATCH 010/638] Add warning about Grafana HA --- 
README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index f441329b..d1becdd7 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,9 @@ This allows managing dashboards via `git` etc. and easily deploying them via CD In the future, a separate Grafana operator will support gathering dashboards from multiple ConfigMaps based on label selection. +WARNING: If you deploy multiple Grafana instances for HA, you must use session affinity. +Otherwise, if pods restart, the Prometheus datasource ID can get out of sync between the pods, breaking the UI. + ## Roadmap * Grafana Operator that dynamically discovers and deploys dashboards from ConfigMaps From f51416ba99a76225ca6db144f39c3989ab55748b Mon Sep 17 00:00:00 2001 From: Max Inden Date: Mon, 13 Mar 2017 09:26:20 +0100 Subject: [PATCH 011/638] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d1becdd7..e4ed7a7a 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ which manages Prometheus servers and their configuration in a cluster. 
With a si * The Operator itself * The Prometheus [node_exporter](https://github.com/prometheus/node_exporter) * [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) -* The [Prometheus specification](https://github.com/coreos/prometheus-operator/blob/master/Documentation/prometheus.md) based on which the Operator deploys a Prometheus setup +* The [Prometheus specification](https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheus) based on which the Operator deploys a Prometheus setup * A Prometheus configuration covering monitoring of all Kubernetes core components and exporters * A default set of alerting rules on the cluster component's health * A Grafana instance serving dashboards on cluster metrics From 55460168260a8a27bc9a4235029c5d0e5ab50fb4 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 9 Mar 2017 20:08:38 +0100 Subject: [PATCH 012/638] prometheus: mount rule files based on label selector --- hack/scripts/{generate-configmaps.sh => generate-manifests.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename hack/scripts/{generate-configmaps.sh => generate-manifests.sh} (100%) diff --git a/hack/scripts/generate-configmaps.sh b/hack/scripts/generate-manifests.sh similarity index 100% rename from hack/scripts/generate-configmaps.sh rename to hack/scripts/generate-manifests.sh From 9ed63f191fe3bf5f9890750a6a660e3e0752a8a8 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 10 Mar 2017 14:15:10 +0100 Subject: [PATCH 013/638] kube-prometheus: generate manifests without kubectl For `--dry-run` to work with kubectl a Kubernetes cluster's apiserver is actually used, which is unnecessary for generating these manifests. This approach also allows further customization, such as adding labels to the generated manifests. 
--- assets/prometheus/rules/etcd2.rules | 2 +- hack/cluster-monitoring/deploy | 6 + .../generate-alertmanager-config-secret.sh | 11 + hack/scripts/generate-dashboards-configmap.sh | 15 + hack/scripts/generate-manifests.sh | 6 +- hack/scripts/generate-rules-configmap.sh | 18 ++ .../alertmanager/alertmanager-config.yaml | 5 +- manifests/grafana/grafana-dashboards.yaml | 22 +- .../prometheus/prometheus-k8s-rules.yaml | 264 +++++++++++------- manifests/prometheus/prometheus-k8s.yaml | 4 + 10 files changed, 238 insertions(+), 115 deletions(-) create mode 100755 hack/scripts/generate-alertmanager-config-secret.sh create mode 100755 hack/scripts/generate-dashboards-configmap.sh create mode 100755 hack/scripts/generate-rules-configmap.sh diff --git a/assets/prometheus/rules/etcd2.rules b/assets/prometheus/rules/etcd2.rules index 4a38894e..10fa5e8d 100644 --- a/assets/prometheus/rules/etcd2.rules +++ b/assets/prometheus/rules/etcd2.rules @@ -29,7 +29,7 @@ ALERT HighNumberOfFailedHTTPRequests # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 FOR 5m LABELS { diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 9ad91eb0..bcb6a42e 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -32,6 +32,12 @@ kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml kctl apply -f manifests/alertmanager/alertmanager-config.yaml kctl apply -f manifests/alertmanager/alertmanager-service.yaml +# unfortunately statefulsets cannot be changed except for their replica count +# so we need to make sure that the rule files are created before we create the +# prometheus resource so it can properly discover 
the rule files when creating +# the statefulset +sleep 5 + # `kubectl apply` is currently not working for third party resources so we are # using `kubectl create` here for the time being. # (https://github.com/kubernetes/kubernetes/issues/29542) diff --git a/hack/scripts/generate-alertmanager-config-secret.sh b/hack/scripts/generate-alertmanager-config-secret.sh new file mode 100755 index 00000000..b0b4aaef --- /dev/null +++ b/hack/scripts/generate-alertmanager-config-secret.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +cat <<-EOF +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-main +data: + alertmanager.yaml: $(cat assets/alertmanager/alertmanager.yaml | base64 --wrap=0) +EOF + diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh new file mode 100755 index 00000000..6e21600e --- /dev/null +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +cat <<-EOF +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards +data: +EOF + +for f in assets/grafana/* +do + echo " $(basename $f): |+" + cat $f | sed "s/^/ /g" +done diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index d031b3a4..bf5f42fa 100755 --- a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -1,11 +1,11 @@ #!/bin/bash # Generate Alert Rules ConfigMap -kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml +hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml # Generate Dashboard ConfigMap -kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml +hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml # Generate Secret for Alertmanager config -kubectl create secret generic alertmanager-main 
--dry-run --from-file=assets/alertmanager/alertmanager.yaml -oyaml > manifests/alertmanager/alertmanager-config.yaml +hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh new file mode 100755 index 00000000..b8e00fef --- /dev/null +++ b/hack/scripts/generate-rules-configmap.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +cat <<-EOF +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-k8s-rules + labels: + role: prometheus-rulefiles + prometheus: k8s +data: +EOF + +for f in assets/prometheus/rules/*.rules +do + echo " $(basename $f): |+" + cat $f | sed "s/^/ /g" +done diff --git a/manifests/alertmanager/alertmanager-config.yaml b/manifests/alertmanager/alertmanager-config.yaml index 49f8c3c4..eee36b33 100644 --- a/manifests/alertmanager/alertmanager-config.yaml +++ b/manifests/alertmanager/alertmanager-config.yaml @@ -1,7 +1,6 @@ apiVersion: v1 -data: - alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg== kind: Secret metadata: - creationTimestamp: null name: alertmanager-main +data: + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg== diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index cba92d49..15244d61 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -1,6 +1,9 @@ 
apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards data: - all-nodes-dashboard.json: | + all-nodes-dashboard.json: |+ { "dashboard": { @@ -861,7 +864,7 @@ data: ], "overwrite": true } - deployment-dashboard.json: |- + deployment-dashboard.json: |+ { "dashboard": { "__inputs": [ @@ -1678,8 +1681,7 @@ data: } ], "overwrite": true - } - kubernetes-pods-dashboard.json: | + } kubernetes-pods-dashboard.json: |+ { "dashboard": { "__inputs": [ @@ -2089,7 +2091,7 @@ data: ], "overwrite": true } - node-dashboard.json: | + node-dashboard.json: |+ { "dashboard": { @@ -2970,7 +2972,7 @@ data: ], "overwrite": true } - prometheus-datasource.json: | + prometheus-datasource.json: |+ { "access": "proxy", "basicAuth": false, @@ -2978,7 +2980,7 @@ data: "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090" } - resource-requests-dashboard.json: |- + resource-requests-dashboard.json: |+ { "__inputs": [ { @@ -3402,8 +3404,4 @@ data: "timezone": "browser", "title": "Resource Requests", "version": 1 - } -kind: ConfigMap -metadata: - creationTimestamp: null - name: grafana-dashboards + } \ No newline at end of file diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 08f6dddc..7327b0db 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -1,62 +1,138 @@ apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-k8s-rules + labels: + role: prometheus-rulefiles + prometheus: k8s data: - etcd2.rules: "### General cluster availability ###\n\n# alert if another failed - peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"} - == 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity - = \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n - \ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n - \ }\n\n### HTTP requests 
alerts ###\n\n# alert if more than 1% of requests to - an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n - \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n - \ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) > - 0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n - \ summary = \"a high number of HTTP requests are failing\",\n description - = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance - {{ $labels.instance }}\",\n }\n\n# alert if more than 5% of requests to an HTTP - endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n - \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m])) - \n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) - > 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS - {\n summary = \"a high number of HTTP requests are failing\",\n description - = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance - {{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT - HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", - code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) - > 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS - {\n summary = \"a high number of HTTP requests are failing\",\n description - = \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses - on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile - of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99, - rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS - {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP - 
requests\",\n description = \"on ectd instance {{ $labels.instance }} HTTP - requests to {{ $label.method }} are slow\",\n }\n\n### File descriptor alerts - ###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert - if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n - \ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS - {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors - soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance - }} will exhaust in file descriptors soon\",\n }\n\n# alert if file descriptors - are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m], - 3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS - {\n summary = \"file descriptors soon exhausted\",\n description = \"{{ - $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors - soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed - proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h]) - > 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary - = \"a high number of failed proposals within the etcd cluster are happening\",\n - \ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }} - proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts - ###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT - HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) - > 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n - \ summary = \"high fsync durations\",\n description = \"ectd instance {{ - $labels.instance }} fync durations are high\",\n }\n" + etcd2.rules: |+ + ### General cluster availability ### + + # 
alert if another failed peer will result in an unavailable cluster + ALERT InsufficientPeers + IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) + FOR 3m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Etcd cluster small", + description = "If one more etcd peer goes down the cluster will be unavailable", + } + + ### HTTP requests alerts ### + + # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if 50% of requests get a 4xx response + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", + } 
+ + # alert if the 99th percentile of HTTP requests take more than 150ms + ALERT HTTPRequestsSlow + IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "slow HTTP requests", + description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + } + + ### File descriptor alerts ### + + instance:fd_utilization = process_open_fds / process_max_fds + + # alert if file descriptors are likely to exhaust within the next 4 hours + ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + } + + # alert if file descriptors are likely to exhaust within the next hour + ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[10m], 3600) > 1 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + } + + ### etcd proposal alerts ### + + # alert if there are several failed proposals within an hour + ALERT HighNumberOfFailedProposals + IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of failed proposals within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + } + + ### etcd disk io latency alerts ### + + # alert if 99th percentile of fsync durations is higher than 500ms + ALERT HighFsyncDurations + IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 + FOR 10m + LABELS { + severity = 
"warning" + } + ANNOTATIONS { + summary = "high fsync durations", + description = "ectd instance {{ $labels.instance }} fync durations are high", + } kubernetes.rules: |+ # NOTE: These rules were kindly contributed by the SoundCloud engineering team. - + ### Container resources ### - + cluster_namespace_controller_pod_container:spec_memory_limit_bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -65,7 +141,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:spec_cpu_shares = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -74,7 +150,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:cpu_usage:rate = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -85,7 +161,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_usage:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -94,7 +170,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_working_set:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -103,7 +179,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_rss:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -112,7 +188,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_cache:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -121,7 +197,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:disk_usage:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -130,7 +206,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + 
cluster_namespace_controller_pod_container:memory_pagefaults:rate = sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( label_replace( @@ -141,7 +217,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_oom:rate = sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( label_replace( @@ -152,39 +228,39 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + ### Cluster resources ### - + cluster:memory_allocation:percent = 100 * sum by (cluster) ( container_spec_memory_limit_bytes{pod_name!=""} ) / sum by (cluster) ( machine_memory_bytes ) - + cluster:memory_used:percent = 100 * sum by (cluster) ( container_memory_usage_bytes{pod_name!=""} ) / sum by (cluster) ( machine_memory_bytes ) - + cluster:cpu_allocation:percent = 100 * sum by (cluster) ( container_spec_cpu_shares{pod_name!=""} ) / sum by (cluster) ( container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores ) - + cluster:node_cpu_use:percent = 100 * sum by (cluster) ( rate(node_cpu{mode!="idle"}[5m]) ) / sum by (cluster) ( machine_cpu_cores ) - + ### API latency ### - + # Raw metrics are in microseconds. Convert to seconds. 
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = histogram_quantile( @@ -201,30 +277,30 @@ data: 0.5, sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) ) / 1e6 - + ### Scheduling latency ### - + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - + ALERT K8SNodeDown IF up{job="kubelet"} == 0 FOR 1h @@ -236,7 +312,7 @@ data: summary = "Kubelet cannot be 
scraped", description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", } - + ALERT K8SNodeNotReady IF kube_node_status_ready{condition="true"} == 0 FOR 1h @@ -248,7 +324,7 @@ data: summary = "Node status is NotReady", description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", } - + ALERT K8SManyNodesNotReady IF count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 @@ -267,7 +343,7 @@ data: summary = "Many K8s nodes are Not Ready", description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", } - + ALERT K8SKubeletNodeExporterDown IF up{job="node-exporter"} == 0 FOR 15m @@ -279,7 +355,7 @@ data: summary = "Kubelet node_exporter cannot be scraped", description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", } - + ALERT K8SKubeletDown IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h @@ -291,7 +367,7 @@ data: summary = "Many Kubelets cannot be scraped", description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", } - + ALERT K8SApiserverDown IF up{job="kubernetes"} == 0 FOR 15m @@ -303,7 +379,7 @@ data: summary = "API server unreachable", description = "An API server could not be scraped.", } - + # Disable for non HA kubernetes setups. 
ALERT K8SApiserverDown IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) @@ -316,7 +392,7 @@ data: summary = "API server unreachable", description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", } - + ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m @@ -328,7 +404,7 @@ data: summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", } - + ALERT K8SControllerManagerDown IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) FOR 5m @@ -340,7 +416,7 @@ data: summary = "Controller manager is down", description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", } - + ALERT K8SConntrackTableFull IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 FOR 10m @@ -352,7 +428,7 @@ data: summary = "Number of tracked connections is near the limit", description = "The nf_conntrack table is {{ $value }}% full.", } - + ALERT K8SConntrackTableFull IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 LABELS { @@ -363,7 +439,7 @@ data: summary = "Number of tracked connections is near the limit", description = "The nf_conntrack table is {{ $value }}% full.", } - + # To catch the conntrack sysctl de-tuning when it happens ALERT K8SConntrackTuningMissing IF node_nf_conntrack_udp_timeout > 10 @@ -376,7 +452,7 @@ data: summary = "Node does not have the correct conntrack tunings", description = "Nodes keep un-setting the correct tunings, investigate when it happens.", } - + ALERT K8STooManyOpenFiles IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 FOR 10m @@ -388,7 +464,7 @@ data: summary = "{{ $labels.job }} has too many open 
file descriptors", description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", } - + ALERT K8STooManyOpenFiles IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 FOR 10m @@ -400,7 +476,7 @@ data: summary = "{{ $labels.job }} has too many open file descriptors", description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", } - + # Some verbs excluded because they are expected to be long-lasting: # WATCHLIST is long-poll, CONNECT is `kubectl exec`. ALERT K8SApiServerLatency @@ -417,7 +493,7 @@ data: summary = "Kubernetes apiserver latency is high", description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", } - + ALERT K8SApiServerEtcdAccessLatency IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 FOR 15m @@ -429,7 +505,7 @@ data: summary = "Access to etcd is slow", description = "99th percentile latency for apiserver to access etcd is higher than 1s.", } - + ALERT K8SKubeletTooManyPods IF kubelet_running_pod_count > 100 LABELS { @@ -440,8 +516,4 @@ data: summary = "Kubelet is close to pod limit", description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", } - -kind: ConfigMap -metadata: - creationTimestamp: null - name: prometheus-k8s-rules + diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 9054ea58..23156650 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -10,6 +10,10 @@ spec: serviceMonitorSelector: matchExpression: - {key: k8s-apps, operator: Exists} + ruleSelector: + matchLabels: + role: prometheus-rulefiles + prometheus: k8s resources: requests: # 2Gi is default, but won't schedule if you don't have a node with >2Gi From d9086e9875af6e01957b8a73fb46b066b238ff65 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 13 Mar 2017 12:08:30 +0100 
Subject: [PATCH 014/638] kube-prometheus: remove duplication in grafana dashboards Datasource links were duplicated in the grafana dashboads. This now also allows exporting grafana dashboards from the UI and just dropping them into the assets directory and they will be wrapped by the manifest generation script. --- assets/grafana/all-nodes-dashboard.json | 1698 +++-- assets/grafana/deployment-dashboard.json | 1611 ++-- assets/grafana/kubernetes-pods-dashboard.json | 793 +- assets/grafana/node-dashboard.json | 1738 +++-- .../grafana/resource-requests-dashboard.json | 840 +-- hack/scripts/generate-dashboards-configmap.sh | 8 +- hack/scripts/wrap-dashboard.sh | 17 +- manifests/grafana/grafana-dashboards.yaml | 6698 +++++++++-------- 8 files changed, 6689 insertions(+), 6714 deletions(-) diff --git a/assets/grafana/all-nodes-dashboard.json b/assets/grafana/all-nodes-dashboard.json index 7a7c2bde..fd36d882 100644 --- a/assets/grafana/all-nodes-dashboard.json +++ b/assets/grafana/all-nodes-dashboard.json @@ -1,860 +1,848 @@ { - "dashboard": -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.1.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ + "__inputs": [ { - "alerting": {}, - "aliasColors": {}, - "bars": false, - 
"datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", - "hide": false, - "intervalFactor": 10, - "legendFormat": "", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_load1)", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load5)", - "intervalFactor": 4, - "legendFormat": "load 5m", 
- "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load15)", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", - "intervalFactor": 2, - "legendFormat": "memory usage", - "metric": "memo", - "refId": "A", - "step": 4, - 
"target": "" - }, - { - "expr": "sum(node_memory_Buffers)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "memo", - "refId": "B", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_Cached)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "memo", - "refId": "C", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_MemFree)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "memo", - "refId": "D", - "step": 4, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": 
"rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "intervalFactor": 2, - "metric": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_disk_bytes_read[5m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 8, - "target": "" - }, - { - "expr": "sum(rate(node_disk_bytes_written[5m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 8 - }, - { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 8 - } - ], - "thresholds": [], - "timeFrom": null, - 
"timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - 
"repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": 
[ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "prometheus" - ], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "All Nodes", - "version": 1 -}, - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "4.1.1" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "" + } + ], + 
"annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 4, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, 
+ "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + 
"refId": "B", + "step": 8 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + 
"type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + 
"values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "All Nodes", + "version": 1 } diff --git a/assets/grafana/deployment-dashboard.json b/assets/grafana/deployment-dashboard.json index 357bd8e6..7a59db7f 100644 --- a/assets/grafana/deployment-dashboard.json +++ b/assets/grafana/deployment-dashboard.json @@ -1,817 +1,806 @@ { - "dashboard": { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": 
"prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "3.1.1" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - } - ], - "id": null, - "title": "Deployment", - "tags": [], - "style": "dark", - "timezone": "browser", - "editable": true, - "hideControls": false, - "sharedCrosshair": true, - "rows": [ - { - "collapse": false, - "editable": true, - "height": "200px", - "panels": [ - { - "title": "CPU", - "error": false, - "span": 4, - "editable": true, - "type": "singlestat", - "isNew": true, - "id": 8, - "targets": [ - { - "refId": "A", - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "step": 600 - } - ], - "links": [], - "datasource": "${DS_PROMETHEUS}", - "maxDataPoints": 100, - "interval": null, - "cacheTimeout": null, - "format": "none", - "prefix": "", - "postfix": "cores", - "nullText": null, - "valueMaps": [ - { - "value": "null", - "op": "=", - "text": "N/A" - } - ], - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "rangeMaps": [ - { - "from": "null", - "to": "null", - "text": "N/A" - } - ], - "mappingType": 1, - "nullPointMode": "connected", - "valueName": "avg", - "prefixFontSize": "50%", - "valueFontSize": "110%", - "postfixFontSize": "50%", - "thresholds": "", - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "sparkline": { - "show": true, - "full": false, - "lineColor": "rgb(31, 120, 
193)", - "fillColor": "rgba(31, 118, 189, 0.18)" - }, - "gauge": { - "show": false, - "minValue": 0, - "maxValue": 100, - "thresholdMarkers": true, - "thresholdLabels": false - } - }, - { - "title": "Memory", - "error": false, - "span": 4, - "editable": true, - "type": "singlestat", - "isNew": true, - "id": 9, - "targets": [ - { - "refId": "A", - "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", - "intervalFactor": 2, - "step": 600 - } - ], - "links": [], - "datasource": "${DS_PROMETHEUS}", - "maxDataPoints": 100, - "interval": null, - "cacheTimeout": null, - "format": "none", - "prefix": "", - "postfix": "GB", - "nullText": null, - "valueMaps": [ - { - "value": "null", - "op": "=", - "text": "N/A" - } - ], - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "rangeMaps": [ - { - "from": "null", - "to": "null", - "text": "N/A" - } - ], - "mappingType": 1, - "nullPointMode": "connected", - "valueName": "avg", - "prefixFontSize": "80%", - "valueFontSize": "110%", - "postfixFontSize": "50%", - "thresholds": "", - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "sparkline": { - "show": true, - "full": false, - "lineColor": "rgb(31, 120, 193)", - "fillColor": "rgba(31, 118, 189, 0.18)" - }, - "gauge": { - "show": false, - "minValue": 0, - "maxValue": 100, - "thresholdMarkers": true, - "thresholdLabels": false - } - }, - { - "title": "Network", - "error": false, - "span": 4, - "editable": true, - "type": "singlestat", - "isNew": true, - "id": 7, - "targets": [ - { - "refId": "A", - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + 
sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "step": 600 - } - ], - "links": [], - "datasource": "${DS_PROMETHEUS}", - "maxDataPoints": 100, - "interval": null, - "cacheTimeout": null, - "format": "Bps", - "prefix": "", - "postfix": "", - "nullText": null, - "valueMaps": [ - { - "value": "null", - "op": "=", - "text": "N/A" - } - ], - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "rangeMaps": [ - { - "from": "null", - "to": "null", - "text": "N/A" - } - ], - "mappingType": 1, - "nullPointMode": "connected", - "valueName": "avg", - "prefixFontSize": "50%", - "valueFontSize": "80%", - "postfixFontSize": "50%", - "thresholds": "", - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "sparkline": { - "show": true, - "full": false, - "lineColor": "rgb(31, 120, 193)", - "fillColor": "rgba(31, 118, 189, 0.18)" - }, - "gauge": { - "show": false, - "minValue": 0, - "maxValue": 100, - "thresholdMarkers": false, - "thresholdLabels": false - } - } - ], - "title": "Row", - "showTitle": false - }, - { - "title": "New row", - "height": "100px", - "editable": true, - "collapse": false, - "panels": [ - { - "title": "Desired Replicas", - "error": false, - "span": 3, - "editable": true, - "type": "singlestat", - "isNew": true, - "id": 5, - "targets": [ - { - "refId": "A", - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "step": 600, - "metric": "kube_deployment_spec_replicas" - } - ], - "links": [], - "datasource": "${DS_PROMETHEUS}", - "maxDataPoints": 100, - "interval": null, - "cacheTimeout": null, - "format": "none", - "prefix": "", - "postfix": "", - "nullText": null, - "valueMaps": [ - 
{ - "value": "null", - "op": "=", - "text": "N/A" - } - ], - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "rangeMaps": [ - { - "from": "null", - "to": "null", - "text": "N/A" - } - ], - "mappingType": 1, - "nullPointMode": "connected", - "valueName": "avg", - "prefixFontSize": "50%", - "valueFontSize": "80%", - "postfixFontSize": "50%", - "thresholds": "", - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "sparkline": { - "show": false, - "full": false, - "lineColor": "rgb(31, 120, 193)", - "fillColor": "rgba(31, 118, 189, 0.18)" - }, - "gauge": { - "show": false, - "minValue": 0, - "maxValue": 100, - "thresholdMarkers": false, - "thresholdLabels": false - }, - "decimals": null - }, - { - "title": "Available Replicas", - "error": false, - "span": 3, - "editable": true, - "type": "singlestat", - "isNew": true, - "id": 6, - "targets": [ - { - "refId": "A", - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "step": 600 - } - ], - "links": [], - "datasource": "${DS_PROMETHEUS}", - "maxDataPoints": 100, - "interval": null, - "cacheTimeout": null, - "format": "none", - "prefix": "", - "postfix": "", - "nullText": null, - "valueMaps": [ - { - "value": "null", - "op": "=", - "text": "N/A" - } - ], - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "rangeMaps": [ - { - "from": "null", - "to": "null", - "text": "N/A" - } - ], - "mappingType": 1, - "nullPointMode": "connected", - "valueName": "avg", - "prefixFontSize": "50%", - "valueFontSize": "80%", - "postfixFontSize": "50%", - "thresholds": "", - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 
129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "sparkline": { - "show": false, - "full": false, - "lineColor": "rgb(31, 120, 193)", - "fillColor": "rgba(31, 118, 189, 0.18)" - }, - "gauge": { - "show": false, - "minValue": 0, - "maxValue": 100, - "thresholdMarkers": true, - "thresholdLabels": false - } - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - 
"editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ] - }, - { - "collapse": false, - "editable": true, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false, - "hideZero": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, 
- "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "legendFormat": "current replicas", - "refId": "A", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B", - "step": 30 - }, - { - "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "legendFormat": "unavailable", - "refId": "C", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "D", - "step": 30 - }, - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "transparent": false - } - ], - "title": "New row", - "showTitle": false - } - ], - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - 
"30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "deployment_namespace", - "options": [], - "query": "label_values(kube_deployment_metadata_generation, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Deployment", - "multi": false, - "name": "deployment_name", - "options": [], - "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "deployment", - "type": "query", - "useTags": false - } - ] - }, - "annotations": { - "list": [] - }, - "schemaVersion": 12, - "version": 2, - "links": [], - "gnetId": null -}, - "inputs": [ + "__inputs": [ { + "description": "", + "label": "prometheus", "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" + "pluginName": "Prometheus", + "type": "datasource" } ], - "overwrite": true -} \ No newline at end of file + "__requires": [ + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "3.1.1" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + 
"editable": true, + "height": "200px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to 
text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": 
"sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + "metric": "kube_deployment_spec_replicas", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Desired Replicas", + "type": "singlestat", + 
"valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, 
+ "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 
193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "title": "New row" + }, + { + "collapse": false, + "editable": true, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "hideZero": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + 
"legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "showTitle": false, + "title": "New row" + } + ], + "schemaVersion": 12, + "sharedCrosshair": true, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + 
"regex": "", + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Deployment", + "version": 2 +} diff --git a/assets/grafana/kubernetes-pods-dashboard.json b/assets/grafana/kubernetes-pods-dashboard.json index 035da015..24036f3e 100644 --- a/assets/grafana/kubernetes-pods-dashboard.json +++ b/assets/grafana/kubernetes-pods-dashboard.json @@ -1,409 +1,398 @@ { - "dashboard": { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "3.1.1" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - 
"links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Current: {{ container_name }}", - "metric": "container_memory_usage_bytes", - "refId": "A", - "step": 10 - }, - { - "expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_requested_memory_bytes", - "refId": "B", - "step": 20 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "Row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 2, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": 
false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", - "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A", - "step": 30 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "refId": "A", - "step": 30 - } - ], - "timeFrom": null, - "timeShift": null, - 
"title": "Network I/O", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - } - ], - "schemaVersion": 12, - "sharedCrosshair": true, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - "options": [], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 1, - "regex": "", - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Pods", - "version": 26 - }, - "inputs": [ + "__inputs": [ { + "description": "", + "label": "prometheus", "name": "DS_PROMETHEUS", "pluginId": "prometheus", - 
"type": "datasource", - "value": "prometheus" + "pluginName": "Prometheus", + "type": "datasource" } ], - "overwrite": true + "__requires": [ + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "3.1.1" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + }, + { + "expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_requested_memory_bytes", + "refId": "B", + "step": 20 + } 
+ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ] + } + ], + "title": "New row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + } + ], + "schemaVersion": 12, + "sharedCrosshair": true, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + 
"regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 26 } diff --git a/assets/grafana/node-dashboard.json b/assets/grafana/node-dashboard.json index 78a5bb37..9a831aaa 100644 --- a/assets/grafana/node-dashboard.json +++ b/assets/grafana/node-dashboard.json @@ -1,880 +1,868 @@ { - "dashboard": -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.1.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - 
"hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ + "__inputs": [ { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", - "hide": false, - "intervalFactor": 10, - "legendFormat": "{{cpu}}", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": 
false, - "steppedLine": false, - "targets": [ - { - "expr": "node_load1{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "node_load5{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "node_load15{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - 
"alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory used", - "metric": "", - "refId": "C", - "step": 4 - }, - { - "expr": "node_memory_Buffers{instance=\"$server\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "", - "refId": "E", - "step": 4 - }, - { - "expr": "node_memory_Cached{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "", - "refId": "F", - "step": 4 - }, - { - "expr": "node_memory_MemFree{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "", - "refId": "D", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - 
"links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "span": 9, - "stack": false, - "steppedLine": 
false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 8, - "target": "" - }, - { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 8 - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 8 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": 
"rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - 
"format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "prometheus" - ], - "templating": { - "list": [ - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - 
"name": "server", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Nodes", - "version": 1 -}, - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "4.1.1" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { 
+ "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + 
"mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 4 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 4 + }, + { + "expr": 
"node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" 
+ } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + 
"bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 1 } diff --git a/assets/grafana/resource-requests-dashboard.json b/assets/grafana/resource-requests-dashboard.json index 10732b85..e34315b8 100644 --- a/assets/grafana/resource-requests-dashboard.json +++ b/assets/grafana/resource-requests-dashboard.json @@ -1,424 +1,424 
@@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.1.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to show the resource requests vs allocatable in the cluster", - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "height": "300", - "panels": [ + "__inputs": [ { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable CPU Cores", - "refId": "A", - "step": 10 - }, - { - "expr": 
"max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested CPU Cores", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "CPU Cores", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "CPU Cores", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "CPU Cores", - "type": "singlestat", - 
"valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "CPU Cores", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable Memory", - "refId": "A", - "step": 10 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested Memory", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": 
[ - { - "format": "bytes", - "label": "Memory", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Memory", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", 
- "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Resource Requests", - "version": 1 -} \ No newline at end of file + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "4.1.1" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to show the resource requests vs allocatable in the cluster", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + 
"refId": "A", + "step": 10 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Cores", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU Cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "CPU 
Cores", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "CPU Cores", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A", + "step": 10 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { 
+ "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", 
+ "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Resource Requests", + "version": 1 +} diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh index 6e21600e..78ad27ac 100755 --- a/hack/scripts/generate-dashboards-configmap.sh +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -8,7 +8,13 @@ metadata: data: EOF -for f in assets/grafana/* +for f in assets/grafana/*-dashboard.json +do + echo " $(basename $f): |+" + hack/scripts/wrap-dashboard.sh $f | sed "s/^/ /g" +done + +for f in assets/grafana/*-datasource.json do echo " $(basename $f): |+" cat $f | sed "s/^/ /g" diff --git a/hack/scripts/wrap-dashboard.sh b/hack/scripts/wrap-dashboard.sh index 1b514387..8eacdf81 100755 --- a/hack/scripts/wrap-dashboard.sh +++ b/hack/scripts/wrap-dashboard.sh @@ -8,12 +8,10 @@ # has error. # * Download dashboard specification as JSON file in Grafana: # Share -> Export -> Save to file. -# * Wrap dashboard specification to make it digestable by kube-prometheus: -# ./hack/scripts/wrap-dashboard.sh Nodes-1488465802729.json -# * Replace dashboard specification: +# * Drop dashboard specification in assets folder: # mv Nodes-1488465802729.json assets/grafana/node-dashboard.json # * Regenerate Grafana configmap: -# ./hack/scripts/generate-configmaps.sh +# ./hack/scripts/generate-manifests.sh # * Apply new configmap: # kubectl -n monitoring apply -f manifests/grafana/grafana-cm.yaml @@ -22,17 +20,16 @@ if [ "$#" -ne 1 ]; then exit 1 fi -json=$1 -temp=$(tempfile -m 0644) +dashboardjson=$1 -cat >> $temp <> $temp +cat $dashboardjson -cat >> $temp <> $temp < Date: Wed, 15 Mar 2017 17:30:51 +0100 Subject: [PATCH 015/638] kube-prometheus: timing issue fixed with prometheus-watcher --- hack/cluster-monitoring/deploy | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index bcb6a42e..9ad91eb0 100755 --- 
a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -32,12 +32,6 @@ kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml kctl apply -f manifests/alertmanager/alertmanager-config.yaml kctl apply -f manifests/alertmanager/alertmanager-service.yaml -# unfortunately statefulsets cannot be changed except for their replica count -# so we need to make sure that the rule files are created before we create the -# prometheus resource so it can properly discover the rule files when creating -# the statefulset -sleep 5 - # `kubectl apply` is currently not working for third party resources so we are # using `kubectl create` here for the time being. # (https://github.com/kubernetes/kubernetes/issues/29542) From b8b3f99a4dc5258a43b5ff5bdf19795013fb22e7 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 17 Mar 2017 16:09:34 +0100 Subject: [PATCH 016/638] *: bump version to v0.7.0 --- manifests/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator.yaml b/manifests/prometheus-operator.yaml index 6c8030eb..06ddf799 100644 --- a/manifests/prometheus-operator.yaml +++ b/manifests/prometheus-operator.yaml @@ -13,7 +13,7 @@ spec: spec: containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.6.0 + image: quay.io/coreos/prometheus-operator:v0.7.0 args: - "--kubelet-object=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" From bf67031b5c9b208935d06ce661a227e9e32324db Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 22 Mar 2017 19:36:17 +0100 Subject: [PATCH 017/638] kube-prometheus: add RBAC resources --- hack/cluster-monitoring/deploy | 5 ++- hack/cluster-monitoring/teardown | 2 +- ...metheus-operator-cluster-role-binding.yaml | 12 ++++++ .../prometheus-operator-cluster-role.yaml | 42 +++++++++++++++++++ .../prometheus-operator-service-account.yaml | 4 ++ .../prometheus-operator.yaml | 5 ++- 
.../prometheus-cluster-role-binding.yaml | 12 ++++++ .../prometheus/prometheus-cluster-role.yaml | 16 +++++++ .../prometheus-k8s-service-account.yaml | 4 ++ manifests/prometheus/prometheus-k8s.yaml | 1 + 10 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml create mode 100644 manifests/prometheus-operator/prometheus-operator-cluster-role.yaml create mode 100644 manifests/prometheus-operator/prometheus-operator-service-account.yaml rename manifests/{ => prometheus-operator}/prometheus-operator.yaml (74%) create mode 100644 manifests/prometheus/prometheus-cluster-role-binding.yaml create mode 100644 manifests/prometheus/prometheus-cluster-role.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-account.yaml diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 9ad91eb0..098af134 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -14,7 +14,7 @@ kctl() { kubectl --namespace "$NAMESPACE" "$@" } -kctl apply -f manifests/prometheus-operator.yaml +kctl apply -f manifests/prometheus-operator # Wait for TPRs to be ready. printf "Waiting for Operator to register third party objects..." 
@@ -28,6 +28,9 @@ kctl apply -f manifests/grafana kctl apply -f manifests/prometheus/prometheus-k8s-rules.yaml kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml +kctl apply -f manifests/prometheus/prometheus-cluster-role-binding.yaml +kctl apply -f manifests/prometheus/prometheus-cluster-role.yaml +kctl apply -f manifests/prometheus/prometheus-k8s-service-account.yaml kctl apply -f manifests/alertmanager/alertmanager-config.yaml kctl apply -f manifests/alertmanager/alertmanager-service.yaml diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown index 45ae61ed..e5e0d9a6 100755 --- a/hack/cluster-monitoring/teardown +++ b/hack/cluster-monitoring/teardown @@ -20,5 +20,5 @@ kctl delete -f manifests/alertmanager # Hack: wait a bit to let the controller delete the deployed Prometheus server. sleep 5 -kctl delete -f manifests/prometheus-operator.yaml +kctl delete -f manifests/prometheus-operator diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml new file mode 100644 index 00000000..bd69276f --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1alpha1 +kind: ClusterRoleBinding +metadata: + name: prometheus-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-operator +subjects: +- kind: ServiceAccount + name: prometheus-operator + namespace: default diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml new file mode 100644 index 00000000..c7bebb9d --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -0,0 +1,42 @@ +apiVersion: rbac.authorization.k8s.io/v1alpha1 +kind: ClusterRole +metadata: + name: prometheus-operator +rules: +- apiGroups: + - 
extensions + resources: + - thirdpartyresources + verbs: + - create +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - prometheuses + - servicemonitors + verbs: + - "*" +- apiGroups: + - apps + resources: + - statefulsets + verbs: ["*"] +- apiGroups: [""] + resources: + - configmaps + - secrets + verbs: ["*"] +- apiGroups: [""] + resources: + - pods + verbs: ["list", "delete"] +- apiGroups: [""] + resources: + - services + - endpoints + verbs: ["get", "create", "update"] +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] diff --git a/manifests/prometheus-operator/prometheus-operator-service-account.yaml b/manifests/prometheus-operator/prometheus-operator-service-account.yaml new file mode 100644 index 00000000..38d18cce --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-operator diff --git a/manifests/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml similarity index 74% rename from manifests/prometheus-operator.yaml rename to manifests/prometheus-operator/prometheus-operator.yaml index 06ddf799..06232af0 100644 --- a/manifests/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -11,12 +11,13 @@ spec: labels: operator: prometheus spec: + serviceAccountName: prometheus-operator containers: - name: prometheus-operator image: quay.io/coreos/prometheus-operator:v0.7.0 args: - - "--kubelet-object=kube-system/kubelet" - - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" + - "--kubelet-object=kube-system/kubelet" + - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" resources: requests: cpu: 100m diff --git a/manifests/prometheus/prometheus-cluster-role-binding.yaml b/manifests/prometheus/prometheus-cluster-role-binding.yaml new file mode 100644 index 00000000..e337527f --- /dev/null +++ 
b/manifests/prometheus/prometheus-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1alpha1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus/prometheus-cluster-role.yaml b/manifests/prometheus/prometheus-cluster-role.yaml new file mode 100644 index 00000000..458c6158 --- /dev/null +++ b/manifests/prometheus/prometheus-cluster-role.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1alpha1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] diff --git a/manifests/prometheus/prometheus-k8s-service-account.yaml b/manifests/prometheus/prometheus-k8s-service-account.yaml new file mode 100644 index 00000000..58d5342d --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-k8s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 23156650..a8a14910 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -7,6 +7,7 @@ metadata: spec: replicas: 2 version: v1.5.2 + serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpression: - {key: k8s-apps, operator: Exists} From bbd5684b43638b199ee3057f74e7994de01f14f7 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 23 Mar 2017 13:39:32 +0100 Subject: [PATCH 018/638] kube-prometheus: add RBAC roles for kube-state-metrics --- ...kube-state-metrics-cluster-role-binding.yaml | 12 ++++++++++++ .../kube-state-metrics-cluster-role.yaml | 17 +++++++++++++++++ 
.../kube-state-metrics-deployment.yaml | 1 + .../kube-state-metrics-service-account.yaml | 4 ++++ 4 files changed, 34 insertions(+) create mode 100644 manifests/exporters/kube-state-metrics-cluster-role-binding.yaml create mode 100644 manifests/exporters/kube-state-metrics-cluster-role.yaml create mode 100644 manifests/exporters/kube-state-metrics-service-account.yaml diff --git a/manifests/exporters/kube-state-metrics-cluster-role-binding.yaml b/manifests/exporters/kube-state-metrics-cluster-role-binding.yaml new file mode 100644 index 00000000..d7e421e6 --- /dev/null +++ b/manifests/exporters/kube-state-metrics-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1alpha1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/manifests/exporters/kube-state-metrics-cluster-role.yaml b/manifests/exporters/kube-state-metrics-cluster-role.yaml new file mode 100644 index 00000000..fdbd41db --- /dev/null +++ b/manifests/exporters/kube-state-metrics-cluster-role.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1alpha1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: +- apiGroups: [""] + resources: + - nodes + - pods + - resourcequotas + verbs: ["list", "watch"] +- apiGroups: ["extensions"] + resources: + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] diff --git a/manifests/exporters/kube-state-metrics-deployment.yaml b/manifests/exporters/kube-state-metrics-deployment.yaml index 3fec8cad..4a4e9ffd 100644 --- a/manifests/exporters/kube-state-metrics-deployment.yaml +++ b/manifests/exporters/kube-state-metrics-deployment.yaml @@ -9,6 +9,7 @@ spec: labels: app: kube-state-metrics spec: + serviceAccountName: kube-state-metrics containers: - name: kube-state-metrics image: 
gcr.io/google_containers/kube-state-metrics:v0.4.1 diff --git a/manifests/exporters/kube-state-metrics-service-account.yaml b/manifests/exporters/kube-state-metrics-service-account.yaml new file mode 100644 index 00000000..99779352 --- /dev/null +++ b/manifests/exporters/kube-state-metrics-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics From 59bcbb70b3d334e25141bc3bc0932c75f4ab1f1d Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 24 Mar 2017 10:27:22 +0100 Subject: [PATCH 019/638] grafana-watcher: use official documented API --- manifests/grafana/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 5a70df49..cad15152 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -31,7 +31,7 @@ spec: memory: 300Mi cpu: 300m - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:latest + image: quay.io/coreos/grafana-watcher:v0.0.1 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://admin:admin@localhost:3000' From 75ce46eef3b0014208a8f93cfa7c116195ecc571 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 27 Mar 2017 10:31:21 +0200 Subject: [PATCH 020/638] grafana-watcher: revert to import endpoint The dashboard create endpoint behaves slightly different when templating is used. The import API works exactly the same as the front-end import functionality. 
--- manifests/grafana/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index cad15152..b727561c 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -31,7 +31,7 @@ spec: memory: 300Mi cpu: 300m - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.1 + image: quay.io/coreos/grafana-watcher:v0.0.2 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://admin:admin@localhost:3000' From ce6b7009f8db7ba76d6dc01022f682b32f7083df Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Mon, 27 Mar 2017 14:06:28 +0200 Subject: [PATCH 021/638] kube-prometheus: fix rolebinding This entire thing is built in the monitoring namespace but the rolebinding was on default. This caused the operator to never launch. --- .../prometheus-operator-cluster-role-binding.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml index bd69276f..5ea7b9c5 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml @@ -9,4 +9,4 @@ roleRef: subjects: - kind: ServiceAccount name: prometheus-operator - namespace: default + namespace: monitoring From 86e585491883065e37768ad293149edd6f3bccc7 Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Mon, 27 Mar 2017 14:50:22 +0200 Subject: [PATCH 022/638] kube-prometheus: put in kube-system These manifests should live in kube-system; otherwise the label selector won't work. 
--- hack/cluster-monitoring/self-hosted-deploy | 2 +- hack/cluster-monitoring/self-hosted-teardown | 2 +- manifests/k8s/self-hosted/kube-controller-manager.yaml | 1 + manifests/k8s/self-hosted/kube-dns.yaml | 1 + manifests/k8s/self-hosted/kube-scheduler.yaml | 1 + 5 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hack/cluster-monitoring/self-hosted-deploy b/hack/cluster-monitoring/self-hosted-deploy index a25f7ed3..7cbce37d 100755 --- a/hack/cluster-monitoring/self-hosted-deploy +++ b/hack/cluster-monitoring/self-hosted-deploy @@ -2,5 +2,5 @@ hack/cluster-monitoring/deploy -kubectl --namespace=kube-system apply -f manifests/k8s/self-hosted +kubectl apply -f manifests/k8s/self-hosted diff --git a/hack/cluster-monitoring/self-hosted-teardown b/hack/cluster-monitoring/self-hosted-teardown index 05fd625a..f9d7da9f 100755 --- a/hack/cluster-monitoring/self-hosted-teardown +++ b/hack/cluster-monitoring/self-hosted-teardown @@ -2,5 +2,5 @@ hack/cluster-monitoring/teardown -kubectl --namespace=kube-system delete -f manifests/k8s/self-hosted +kubectl delete -f manifests/k8s/self-hosted diff --git a/manifests/k8s/self-hosted/kube-controller-manager.yaml b/manifests/k8s/self-hosted/kube-controller-manager.yaml index 2f22a6f2..a2983101 100644 --- a/manifests/k8s/self-hosted/kube-controller-manager.yaml +++ b/manifests/k8s/self-hosted/kube-controller-manager.yaml @@ -1,6 +1,7 @@ apiVersion: v1 kind: Service metadata: + namespace: kube-system name: kube-controller-manager-prometheus-discovery labels: k8s-app: kube-controller-manager diff --git a/manifests/k8s/self-hosted/kube-dns.yaml b/manifests/k8s/self-hosted/kube-dns.yaml index 36d9a0ad..e0327714 100644 --- a/manifests/k8s/self-hosted/kube-dns.yaml +++ b/manifests/k8s/self-hosted/kube-dns.yaml @@ -1,6 +1,7 @@ apiVersion: v1 kind: Service metadata: + namespace: kube-system name: kube-dns-prometheus-discovery labels: k8s-app: kube-dns diff --git a/manifests/k8s/self-hosted/kube-scheduler.yaml 
b/manifests/k8s/self-hosted/kube-scheduler.yaml index 331998fe..0fe05dd7 100644 --- a/manifests/k8s/self-hosted/kube-scheduler.yaml +++ b/manifests/k8s/self-hosted/kube-scheduler.yaml @@ -1,6 +1,7 @@ apiVersion: v1 kind: Service metadata: + namespace: kube-system name: kube-scheduler-prometheus-discovery labels: k8s-app: kube-scheduler From a2ca552f9793011c2e5bd6a4fc36b91b5a1bf169 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 6 Apr 2017 13:41:27 +0200 Subject: [PATCH 023/638] Documentation: Prometheus requires RBAC role for apiserver metrics --- manifests/prometheus/prometheus-cluster-role.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/manifests/prometheus/prometheus-cluster-role.yaml b/manifests/prometheus/prometheus-cluster-role.yaml index 458c6158..0a8526d5 100644 --- a/manifests/prometheus/prometheus-cluster-role.yaml +++ b/manifests/prometheus/prometheus-cluster-role.yaml @@ -14,3 +14,5 @@ rules: resources: - configmaps verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] From 066b04322d62605f203637574ffbf645597b5fca Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 6 Apr 2017 15:24:22 +0200 Subject: [PATCH 024/638] grafana-watcher: allow credentials from env variable --- .../generate-grafana-credentials-secret.sh | 20 +++++++++++++ hack/scripts/generate-manifests.sh | 3 ++ manifests/grafana/grafana-credentials.yaml | 7 +++++ manifests/grafana/grafana-deployment.yaml | 30 ++++++++++++++++--- 4 files changed, 56 insertions(+), 4 deletions(-) create mode 100755 hack/scripts/generate-grafana-credentials-secret.sh create mode 100644 manifests/grafana/grafana-credentials.yaml diff --git a/hack/scripts/generate-grafana-credentials-secret.sh b/hack/scripts/generate-grafana-credentials-secret.sh new file mode 100755 index 00000000..e877b080 --- /dev/null +++ b/hack/scripts/generate-grafana-credentials-secret.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 user password" + exit 1 +fi + +user=$1 
+password=$2 + +cat <<-EOF +apiVersion: v1 +kind: Secret +metadata: + name: grafana-credentials +data: + user: $(echo -n ${user} | base64 --wrap=0) + password: $(echo -n ${password} | base64 --wrap=0) +EOF + diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index bf5f42fa..280bc121 100755 --- a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -6,6 +6,9 @@ hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-r # Generate Dashboard ConfigMap hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml +# Generate Grafana Credentials Secret +hack/scripts/generate-grafana-credentials-secret.sh admin admin > manifests/grafana/grafana-credentials.yaml + # Generate Secret for Alertmanager config hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml diff --git a/manifests/grafana/grafana-credentials.yaml b/manifests/grafana/grafana-credentials.yaml new file mode 100644 index 00000000..c3da1b63 --- /dev/null +++ b/manifests/grafana/grafana-credentials.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: grafana-credentials +data: + user: YWRtaW4= + password: YWRtaW4= diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index b727561c..e83d265d 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -17,6 +17,16 @@ spec: value: "true" - name: GF_AUTH_ANONYMOUS_ENABLED value: "true" + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana-credentials + key: user + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-credentials + key: password volumeMounts: - name: grafana-storage mountPath: /var/grafana-storage @@ -28,13 +38,25 @@ spec: memory: 100Mi cpu: 100m limits: - memory: 300Mi - cpu: 300m + memory: 200Mi + cpu: 200m - name: grafana-watcher - 
image: quay.io/coreos/grafana-watcher:v0.0.2 + image: quay.io/coreos/grafana-watcher:v0.0.3 + imagePullPolicy: Never args: - '--watch-dir=/var/grafana-dashboards' - - '--grafana-url=http://admin:admin@localhost:3000' + - '--grafana-url=http://localhost:3000' + env: + - name: GRAFANA_USER + valueFrom: + secretKeyRef: + name: grafana-credentials + key: user + - name: GRAFANA_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-credentials + key: password volumeMounts: - name: grafana-dashboards mountPath: /var/grafana-dashboards From 5f05aa7e0732645ad505d7cbb2f30af49af33313 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 7 Apr 2017 10:58:17 +0200 Subject: [PATCH 025/638] cut 0.8.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 06232af0..524dc292 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.7.0 + image: quay.io/coreos/prometheus-operator:v0.8.0 args: - "--kubelet-object=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" From d743988104f2b962681e576c89be08cd8db82729 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 7 Apr 2017 13:52:10 +0200 Subject: [PATCH 026/638] kube-prometheus: remove grafana never pull policy --- manifests/grafana/grafana-deployment.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index e83d265d..b133a7fc 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -42,7 +42,6 @@ spec: cpu: 200m - name: grafana-watcher image: 
quay.io/coreos/grafana-watcher:v0.0.3 - imagePullPolicy: Never args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://localhost:3000' From 78f543495dac4c76b209349474ca8ad7461404fd Mon Sep 17 00:00:00 2001 From: Giancarlo Rubio Date: Mon, 10 Apr 2017 16:28:11 +0200 Subject: [PATCH 027/638] Increase memory limits to avoid OOMKilled Related to https://github.com/kubernetes/kube-state-metrics/issues/112#issuecomment-292965833 --- manifests/exporters/kube-state-metrics-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/exporters/kube-state-metrics-deployment.yaml b/manifests/exporters/kube-state-metrics-deployment.yaml index 4a4e9ffd..8e2a2893 100644 --- a/manifests/exporters/kube-state-metrics-deployment.yaml +++ b/manifests/exporters/kube-state-metrics-deployment.yaml @@ -18,9 +18,9 @@ spec: containerPort: 8080 resources: requests: - memory: 30Mi + memory: 100Mi cpu: 100m limits: - memory: 50Mi + memory: 200Mi cpu: 200m From c2daa0346c10e215816584619896152d81bdcdbf Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 13 Apr 2017 16:43:11 +0200 Subject: [PATCH 028/638] cut 0.8.1 --- .../prometheus-operator.yaml | 2 +- .../prometheus-k8s-servicemonitors.yaml | 23 ------------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 524dc292..7d11df9b 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.8.0 + image: quay.io/coreos/prometheus-operator:v0.8.1 args: - "--kubelet-object=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" diff --git a/manifests/prometheus/prometheus-k8s-servicemonitors.yaml 
b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml index 110dfa42..8b1f2b4a 100644 --- a/manifests/prometheus/prometheus-k8s-servicemonitors.yaml +++ b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml @@ -24,29 +24,6 @@ spec: --- apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor -metadata: - name: k8s-apps-https - labels: - k8s-apps: https -spec: - jobLabel: k8s-app - selector: - matchExpressions: - - {key: k8s-app, operator: Exists} - namespaceSelector: - matchNames: - - kube-system - endpoints: - - port: https-metrics - interval: 15s - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token ---- -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ServiceMonitor metadata: name: k8s-apps-http labels: From c3cb0ac749d1549fa62c296b59670f8392caa939 Mon Sep 17 00:00:00 2001 From: Jeff Cook Date: Fri, 14 Apr 2017 23:25:12 -0600 Subject: [PATCH 029/638] Ensure that grafana-credentials are created first. ... Without this, a race condition may occur, preventing grafana-watcher from logging in to grafana. --- hack/cluster-monitoring/deploy | 1 + 1 file changed, 1 insertion(+) diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 098af134..3c3a5dcd 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -24,6 +24,7 @@ until kctl get alertmanager > /dev/null 2>&1; do sleep 1; printf "."; done echo "done!" 
kctl apply -f manifests/exporters +kctl apply -f manifests/grafana/grafana-credentials.yaml kctl apply -f manifests/grafana kctl apply -f manifests/prometheus/prometheus-k8s-rules.yaml From f715d855300dc23dd1d0bbbdb5bcbb783bdfae4d Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Tue, 18 Apr 2017 15:26:59 +0200 Subject: [PATCH 030/638] cut new grafana-watcher release --- manifests/grafana/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index b133a7fc..b2ae3f8f 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -41,7 +41,7 @@ spec: memory: 200Mi cpu: 200m - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.3 + image: quay.io/coreos/grafana-watcher:v0.0.4 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://localhost:3000' From 2d962a310d5c1db955f83d7d64d26bba1269d9f5 Mon Sep 17 00:00:00 2001 From: Giancarlo Rubio Date: Thu, 20 Apr 2017 12:54:20 +0200 Subject: [PATCH 031/638] setup service monitor for prometheus and alertanager --- .../alertmanager/alertmanager-service.yaml | 2 ++ .../prometheus/prometheus-k8s-service.yaml | 2 ++ .../prometheus-k8s-servicemonitors.yaml | 26 +++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/manifests/alertmanager/alertmanager-service.yaml b/manifests/alertmanager/alertmanager-service.yaml index 1608d14d..a5413102 100644 --- a/manifests/alertmanager/alertmanager-service.yaml +++ b/manifests/alertmanager/alertmanager-service.yaml @@ -1,6 +1,8 @@ apiVersion: v1 kind: Service metadata: + labels: + alertmanager: main name: alertmanager-main spec: type: NodePort diff --git a/manifests/prometheus/prometheus-k8s-service.yaml b/manifests/prometheus/prometheus-k8s-service.yaml index a558f30f..5cd3b65b 100644 --- a/manifests/prometheus/prometheus-k8s-service.yaml +++ 
b/manifests/prometheus/prometheus-k8s-service.yaml @@ -1,6 +1,8 @@ apiVersion: v1 kind: Service metadata: + labels: + prometheus: k8s name: prometheus-k8s spec: type: NodePort diff --git a/manifests/prometheus/prometheus-k8s-servicemonitors.yaml b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml index 8b1f2b4a..ac2a26a1 100644 --- a/manifests/prometheus/prometheus-k8s-servicemonitors.yaml +++ b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml @@ -44,3 +44,29 @@ spec: interval: 15s - port: http-metrics-skydns interval: 15s +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: prometheus + labels: + prometheus: k8s +spec: + endpoints: + - port: web + selector: + matchExpressions: + - {key: prometheus, operator: In, values: [k8s]} +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + labels: + alertmanager: main + name: alertmanager +spec: + endpoints: + - port: web + selector: + matchExpressions: + - {key: alertmanager, operator: In, values: [main]} \ No newline at end of file From 48bda634ce4af95b057a6ef0a3298994c9c23a14 Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Thu, 20 Apr 2017 10:23:12 +0200 Subject: [PATCH 032/638] Bump default Prometheus version to 1.6.1 --- manifests/examples/example-app/prometheus-frontend.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index 915ded6a..34b86ba4 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,7 +6,7 @@ metadata: labels: prometheus: frontend spec: - version: v1.5.2 + version: v1.6.1 serviceMonitorSelector: matchLabels: tier: frontend diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index a8a14910..1e77d7cc 100644 --- 
a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.5.2 + version: v1.6.1 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpression: From db2741f8054dc742d2bc0734b6a5ad78ddb0354d Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Thu, 20 Apr 2017 17:21:16 +0200 Subject: [PATCH 033/638] *: cut v0.8.2 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 7d11df9b..dbdd32bf 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.8.1 + image: quay.io/coreos/prometheus-operator:v0.8.2 args: - "--kubelet-object=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" From ce0a9caae748c690c26578a5d6a05b88ab312ded Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 26 Apr 2017 16:09:15 +0200 Subject: [PATCH 034/638] kube-prometheus: fix deployment dashboard multiple values error --- assets/grafana/deployment-dashboard.json | 18 +++++++++--------- manifests/grafana/grafana-dashboards.yaml | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/assets/grafana/deployment-dashboard.json b/assets/grafana/deployment-dashboard.json index 7a59db7f..8bc9f40c 100644 --- a/assets/grafana/deployment-dashboard.json +++ b/assets/grafana/deployment-dashboard.json @@ -348,7 +348,7 @@ }, "targets": [ { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": 
"max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "metric": "kube_deployment_spec_replicas", "refId": "A", @@ -426,7 +426,7 @@ }, "targets": [ { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "refId": "A", "step": 600 @@ -503,7 +503,7 @@ }, "targets": [ { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -581,7 +581,7 @@ }, "targets": [ { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -648,35 +648,35 @@ "steppedLine": false, "targets": [ { - "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "current replicas", "refId": "A", "step": 30 }, { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": 
"min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "available", "refId": "B", "step": 30 }, { - "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "unavailable", "refId": "C", "step": 30 }, { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "updated", "refId": "D", "step": 30 }, { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "desired", "refId": "E", diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 7df20cac..256030a3 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -1218,7 +1218,7 @@ data: }, "targets": [ { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "metric": "kube_deployment_spec_replicas", "refId": "A", @@ -1296,7 +1296,7 @@ data: }, "targets": [ { - 
"expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "refId": "A", "step": 600 @@ -1373,7 +1373,7 @@ data: }, "targets": [ { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -1451,7 +1451,7 @@ data: }, "targets": [ { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -1518,35 +1518,35 @@ data: "steppedLine": false, "targets": [ { - "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "current replicas", "refId": "A", "step": 30 }, { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "available", "refId": "B", "step": 30 }, { - "expr": 
"max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "unavailable", "refId": "C", "step": 30 }, { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "updated", "refId": "D", "step": 30 }, { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance)", + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "desired", "refId": "E", From a0e4d3352a347933ba0460dae70fa2b6c6f99fb6 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 27 Apr 2017 12:33:55 +0200 Subject: [PATCH 035/638] prometheus: remove non-namespaced alpha annotation Now that we have the possibility to use the honor_labels configuration there is no need for this specially treated annotation anymore. 
--- manifests/exporters/kube-state-metrics-service.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/manifests/exporters/kube-state-metrics-service.yaml b/manifests/exporters/kube-state-metrics-service.yaml index 607869e1..292c4978 100644 --- a/manifests/exporters/kube-state-metrics-service.yaml +++ b/manifests/exporters/kube-state-metrics-service.yaml @@ -4,8 +4,6 @@ metadata: labels: app: kube-state-metrics k8s-app: kube-state-metrics - annotations: - alpha.monitoring.coreos.com/non-namespaced: "true" name: kube-state-metrics spec: ports: From 309c677270ad0728e2936afb4bd75d56ad7ac122 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 27 Apr 2017 12:35:06 +0200 Subject: [PATCH 036/638] kube-prometheus: extract ServiceMonitors into single files Starting with Kubernetes 1.6 `kubectl apply` works for TPRs so we can start using it, therefore using multiple files is no issue. --- hack/cluster-monitoring/deploy | 18 +---- ...heus-k8s-service-monitor-alertmanager.yaml | 12 ++++ ...metheus-k8s-service-monitor-apiserver.yaml | 23 ++++++ ...eus-k8s-service-monitor-k8s-apps-http.yaml | 23 ++++++ ...8s-service-monitor-kube-state-metrics.yaml | 19 +++++ ...rometheus-k8s-service-monitor-kubelet.yaml | 17 +++++ ...eus-k8s-service-monitor-node-exporter.yaml | 18 +++++ ...etheus-k8s-service-monitor-prometheus.yaml | 12 ++++ .../prometheus-k8s-servicemonitors.yaml | 72 ------------------- 9 files changed, 126 insertions(+), 88 deletions(-) create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml 
create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml delete mode 100644 manifests/prometheus/prometheus-k8s-servicemonitors.yaml diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 3c3a5dcd..2d0d50ad 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -26,20 +26,6 @@ echo "done!" kctl apply -f manifests/exporters kctl apply -f manifests/grafana/grafana-credentials.yaml kctl apply -f manifests/grafana - -kctl apply -f manifests/prometheus/prometheus-k8s-rules.yaml -kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml -kctl apply -f manifests/prometheus/prometheus-cluster-role-binding.yaml -kctl apply -f manifests/prometheus/prometheus-cluster-role.yaml -kctl apply -f manifests/prometheus/prometheus-k8s-service-account.yaml - -kctl apply -f manifests/alertmanager/alertmanager-config.yaml -kctl apply -f manifests/alertmanager/alertmanager-service.yaml - -# `kubectl apply` is currently not working for third party resources so we are -# using `kubectl create` here for the time being. 
-# (https://github.com/kubernetes/kubernetes/issues/29542) -kctl create -f manifests/prometheus/prometheus-k8s-servicemonitors.yaml -kctl create -f manifests/prometheus/prometheus-k8s.yaml -kctl create -f manifests/alertmanager/alertmanager.yaml +kctl apply -f manifests/prometheus/ +kctl apply -f manifests/alertmanager/ diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml new file mode 100644 index 00000000..d193b676 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml @@ -0,0 +1,12 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + labels: + alertmanager: main + name: alertmanager +spec: + endpoints: + - port: web + selector: + matchExpressions: + - {key: alertmanager, operator: In, values: [main]} diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml new file mode 100644 index 00000000..1fd793e5 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml @@ -0,0 +1,23 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-apiserver + labels: + k8s-apps: https +spec: + jobLabel: provider + selector: + matchLabels: + component: apiserver + provider: kubernetes + namespaceSelector: + matchNames: + - default + endpoints: + - port: https + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml new file mode 100644 index 00000000..fbfcda97 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml @@ -0,0 +1,23 @@ 
+apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-http + namespace: monitoring + labels: + k8s-apps: http +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + - {key: k8s-app, operator: NotIn, values: [kubelet]} + namespaceSelector: + matchNames: + - kube-system + endpoints: + - port: http-metrics + interval: 15s + - port: http-metrics-dnsmasq + interval: 15s + - port: http-metrics-skydns + interval: 15s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml new file mode 100644 index 00000000..c4ed1afc --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-state-metrics + namespace: monitoring + labels: + k8s-apps: http +spec: + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: kube-state-metrics + namespaceSelector: + matchNames: + - monitoring + endpoints: + - port: http-metrics + interval: 15s + honorLabels: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml new file mode 100644 index 00000000..4e9aabd8 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kubelet + labels: + k8s-apps: http +spec: + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: kubelet + namespaceSelector: + matchNames: + - kube-system + endpoints: + - port: http-metrics + interval: 15s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml new file mode 100644 index 00000000..a7b20301 --- /dev/null +++ 
b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: node-exporter + namespace: monitoring + labels: + k8s-apps: http +spec: + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: node-exporter + namespaceSelector: + matchNames: + - monitoring + endpoints: + - port: http-metrics + interval: 15s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml new file mode 100644 index 00000000..5e5d17be --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml @@ -0,0 +1,12 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: prometheus + labels: + prometheus: k8s +spec: + endpoints: + - port: web + selector: + matchExpressions: + - {key: prometheus, operator: In, values: [k8s]} diff --git a/manifests/prometheus/prometheus-k8s-servicemonitors.yaml b/manifests/prometheus/prometheus-k8s-servicemonitors.yaml deleted file mode 100644 index ac2a26a1..00000000 --- a/manifests/prometheus/prometheus-k8s-servicemonitors.yaml +++ /dev/null @@ -1,72 +0,0 @@ -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ServiceMonitor -metadata: - name: kube-apiserver - labels: - k8s-apps: https -spec: - jobLabel: provider - selector: - matchLabels: - component: apiserver - provider: kubernetes - namespaceSelector: - matchNames: - - default - endpoints: - - port: https - interval: 15s - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - serverName: kubernetes - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token ---- -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ServiceMonitor -metadata: - name: k8s-apps-http - labels: - k8s-apps: http -spec: - jobLabel: k8s-app - selector: - matchExpressions: - - {key: k8s-app, operator: Exists} - namespaceSelector: - 
matchNames: - - kube-system - - monitoring - endpoints: - - port: http-metrics - interval: 15s - - port: http-metrics-dnsmasq - interval: 15s - - port: http-metrics-skydns - interval: 15s ---- -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ServiceMonitor -metadata: - name: prometheus - labels: - prometheus: k8s -spec: - endpoints: - - port: web - selector: - matchExpressions: - - {key: prometheus, operator: In, values: [k8s]} ---- -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ServiceMonitor -metadata: - labels: - alertmanager: main - name: alertmanager -spec: - endpoints: - - port: web - selector: - matchExpressions: - - {key: alertmanager, operator: In, values: [main]} \ No newline at end of file From b94bb77a66197d8fbec4a33815762e126d6bad57 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 28 Apr 2017 14:34:13 +0200 Subject: [PATCH 037/638] alertmanager: bump default version to v0.6.1 --- manifests/alertmanager/alertmanager.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index fbd2d452..49f71250 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -6,4 +6,4 @@ metadata: alertmanager: "main" spec: replicas: 3 - version: v0.5.1 + version: v0.6.1 From d53553444b718180b55d4a6be2b31ade93343546 Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Tue, 2 May 2017 12:10:44 +0200 Subject: [PATCH 038/638] Remove unnecessary prometheus- prefix --- manifests/examples/example-app/prometheus-frontend.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index 34b86ba4..e1466945 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -1,7 +1,7 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: 
Prometheus metadata: - name: prometheus-frontend + name: frontend namespace: default labels: prometheus: frontend From 6a12f669f7cf0ac2ba39ee4290022a7577e245a2 Mon Sep 17 00:00:00 2001 From: Tapani Moilanen Date: Wed, 3 May 2017 20:02:22 +0300 Subject: [PATCH 039/638] Use kubectl apply in custom service example deploy https://github.com/kubernetes/kubernetes/issues/29542 has been fixed, use apply instead of create to create third party resources in custom service monitoring example. --- hack/example-service-monitoring/deploy | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy index 420b5940..7691047c 100755 --- a/hack/example-service-monitoring/deploy +++ b/hack/example-service-monitoring/deploy @@ -10,10 +10,6 @@ fi kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-svc.yaml kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/example-app.yaml - -# `kubectl apply` is currently not working for third party resources so we are -# using `kubectl create` here for the time being. 
-# (https://github.com/kubernetes/kubernetes/issues/29542) -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" create -f manifests/examples/example-app/prometheus-frontend.yaml -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" create -f manifests/examples/example-app/servicemonitor-frontend.yaml +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend.yaml +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/servicemonitor-frontend.yaml From fadece6188957e36e9ec3f5ffb768d4e7b20a5be Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 4 May 2017 10:47:07 +0200 Subject: [PATCH 040/638] kube-prometheus: make all RBAC manifests v1beta1 --- .../exporters/kube-state-metrics-cluster-role-binding.yaml | 2 +- manifests/exporters/kube-state-metrics-cluster-role.yaml | 2 +- .../prometheus-operator-cluster-role-binding.yaml | 2 +- .../prometheus-operator/prometheus-operator-cluster-role.yaml | 2 +- manifests/prometheus/prometheus-cluster-role-binding.yaml | 2 +- manifests/prometheus/prometheus-cluster-role.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/manifests/exporters/kube-state-metrics-cluster-role-binding.yaml b/manifests/exporters/kube-state-metrics-cluster-role-binding.yaml index d7e421e6..8284fc15 100644 --- a/manifests/exporters/kube-state-metrics-cluster-role-binding.yaml +++ b/manifests/exporters/kube-state-metrics-cluster-role-binding.yaml @@ -1,4 +1,4 @@ -apiVersion: rbac.authorization.k8s.io/v1alpha1 +apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: kube-state-metrics diff --git a/manifests/exporters/kube-state-metrics-cluster-role.yaml b/manifests/exporters/kube-state-metrics-cluster-role.yaml index fdbd41db..e4b30cf5 100644 --- a/manifests/exporters/kube-state-metrics-cluster-role.yaml +++ b/manifests/exporters/kube-state-metrics-cluster-role.yaml @@ -1,4 +1,4 
@@ -apiVersion: rbac.authorization.k8s.io/v1alpha1 +apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: kube-state-metrics diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml index 5ea7b9c5..e7e03a29 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml @@ -1,4 +1,4 @@ -apiVersion: rbac.authorization.k8s.io/v1alpha1 +apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: prometheus-operator diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml index c7bebb9d..2f248651 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -1,4 +1,4 @@ -apiVersion: rbac.authorization.k8s.io/v1alpha1 +apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: prometheus-operator diff --git a/manifests/prometheus/prometheus-cluster-role-binding.yaml b/manifests/prometheus/prometheus-cluster-role-binding.yaml index e337527f..3600490f 100644 --- a/manifests/prometheus/prometheus-cluster-role-binding.yaml +++ b/manifests/prometheus/prometheus-cluster-role-binding.yaml @@ -1,4 +1,4 @@ -apiVersion: rbac.authorization.k8s.io/v1alpha1 +apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: prometheus diff --git a/manifests/prometheus/prometheus-cluster-role.yaml b/manifests/prometheus/prometheus-cluster-role.yaml index 0a8526d5..a85422ec 100644 --- a/manifests/prometheus/prometheus-cluster-role.yaml +++ b/manifests/prometheus/prometheus-cluster-role.yaml @@ -1,4 +1,4 @@ -apiVersion: rbac.authorization.k8s.io/v1alpha1 +apiVersion: 
rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: prometheus From 3e4f0d1115cb682ca42b7fd8fc1e45dd7f99fa78 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 4 May 2017 10:49:05 +0200 Subject: [PATCH 041/638] kube-prometheus: separate exporters in their own directories --- hack/cluster-monitoring/deploy | 3 ++- hack/cluster-monitoring/teardown | 3 ++- .../kube-state-metrics-cluster-role-binding.yaml | 0 .../kube-state-metrics-cluster-role.yaml | 0 .../kube-state-metrics-deployment.yaml | 0 .../kube-state-metrics-service-account.yaml | 0 .../kube-state-metrics-service.yaml | 0 .../{exporters => node-exporter}/node-exporter-daemonset.yaml | 0 .../{exporters => node-exporter}/node-exporter-service.yaml | 0 9 files changed, 4 insertions(+), 2 deletions(-) rename manifests/{exporters => kube-state-metrics}/kube-state-metrics-cluster-role-binding.yaml (100%) rename manifests/{exporters => kube-state-metrics}/kube-state-metrics-cluster-role.yaml (100%) rename manifests/{exporters => kube-state-metrics}/kube-state-metrics-deployment.yaml (100%) rename manifests/{exporters => kube-state-metrics}/kube-state-metrics-service-account.yaml (100%) rename manifests/{exporters => kube-state-metrics}/kube-state-metrics-service.yaml (100%) rename manifests/{exporters => node-exporter}/node-exporter-daemonset.yaml (100%) rename manifests/{exporters => node-exporter}/node-exporter-service.yaml (100%) diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 2d0d50ad..c565d442 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -23,7 +23,8 @@ until kctl get prometheus > /dev/null 2>&1; do sleep 1; printf "."; done until kctl get alertmanager > /dev/null 2>&1; do sleep 1; printf "."; done echo "done!" 
-kctl apply -f manifests/exporters +kctl apply -f manifests/node-exporter +kctl apply -f manifests/kube-state-metrics kctl apply -f manifests/grafana/grafana-credentials.yaml kctl apply -f manifests/grafana kctl apply -f manifests/prometheus/ diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown index e5e0d9a6..9fcc4513 100755 --- a/hack/cluster-monitoring/teardown +++ b/hack/cluster-monitoring/teardown @@ -12,7 +12,8 @@ kctl() { kubectl --namespace "$NAMESPACE" "$@" } -kctl delete -f manifests/exporters +kctl delete -f manifests/node-exporter +kctl delete -f manifests/kube-state-metrics kctl delete -f manifests/grafana kctl delete -f manifests/prometheus kctl delete -f manifests/alertmanager diff --git a/manifests/exporters/kube-state-metrics-cluster-role-binding.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml similarity index 100% rename from manifests/exporters/kube-state-metrics-cluster-role-binding.yaml rename to manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml diff --git a/manifests/exporters/kube-state-metrics-cluster-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml similarity index 100% rename from manifests/exporters/kube-state-metrics-cluster-role.yaml rename to manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml diff --git a/manifests/exporters/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml similarity index 100% rename from manifests/exporters/kube-state-metrics-deployment.yaml rename to manifests/kube-state-metrics/kube-state-metrics-deployment.yaml diff --git a/manifests/exporters/kube-state-metrics-service-account.yaml b/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml similarity index 100% rename from manifests/exporters/kube-state-metrics-service-account.yaml rename to manifests/kube-state-metrics/kube-state-metrics-service-account.yaml diff --git 
a/manifests/exporters/kube-state-metrics-service.yaml b/manifests/kube-state-metrics/kube-state-metrics-service.yaml similarity index 100% rename from manifests/exporters/kube-state-metrics-service.yaml rename to manifests/kube-state-metrics/kube-state-metrics-service.yaml diff --git a/manifests/exporters/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml similarity index 100% rename from manifests/exporters/node-exporter-daemonset.yaml rename to manifests/node-exporter/node-exporter-daemonset.yaml diff --git a/manifests/exporters/node-exporter-service.yaml b/manifests/node-exporter/node-exporter-service.yaml similarity index 100% rename from manifests/exporters/node-exporter-service.yaml rename to manifests/node-exporter/node-exporter-service.yaml From b15843fd4c9a17a005126452e837784f8b49f8f9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 4 May 2017 10:51:30 +0200 Subject: [PATCH 042/638] kube-prometheus: update kube-state-metrics to latest release --- manifests/kube-state-metrics/kube-state-metrics-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index 8e2a2893..bc5f0400 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -12,7 +12,7 @@ spec: serviceAccountName: kube-state-metrics containers: - name: kube-state-metrics - image: gcr.io/google_containers/kube-state-metrics:v0.4.1 + image: quay.io/coreos/kube-state-metrics:v0.5.0 ports: - name: metrics containerPort: 8080 From c6c67bb4507915275b7bedfed577fa6e95eaba32 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 4 May 2017 10:52:21 +0200 Subject: [PATCH 043/638] kube-prometheus: update node-exporter to latest release --- manifests/node-exporter/node-exporter-daemonset.yaml | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml index 8c9565ba..d42413cb 100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -12,7 +12,7 @@ spec: hostNetwork: true hostPID: true containers: - - image: quay.io/prometheus/node-exporter:v0.13.0 + - image: quay.io/prometheus/node-exporter:v0.14.0 args: - "-collector.procfs=/host/proc" - "-collector.sysfs=/host/sys" From 64c959e018e4dfcaf75c86d706a2b9e0b160a241 Mon Sep 17 00:00:00 2001 From: Giancarlo Rubio Date: Fri, 5 May 2017 19:04:20 +0200 Subject: [PATCH 044/638] Add ability to authenticate an endpoint Write an example for basic auth --- manifests/examples/basic-auth/secrets.yaml | 8 +++++++ .../examples/basic-auth/service-monitor.yaml | 22 +++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 manifests/examples/basic-auth/secrets.yaml create mode 100644 manifests/examples/basic-auth/service-monitor.yaml diff --git a/manifests/examples/basic-auth/secrets.yaml b/manifests/examples/basic-auth/secrets.yaml new file mode 100644 index 00000000..fa0dd897 --- /dev/null +++ b/manifests/examples/basic-auth/secrets.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: basic-auth +data: + password: dG9vcg== # toor + user: YWRtaW4= # admin +type: Opaque \ No newline at end of file diff --git a/manifests/examples/basic-auth/service-monitor.yaml b/manifests/examples/basic-auth/service-monitor.yaml new file mode 100644 index 00000000..8d9549f0 --- /dev/null +++ b/manifests/examples/basic-auth/service-monitor.yaml @@ -0,0 +1,22 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + labels: + k8s-apps: basic-auth-example + name: basic-auth-example +spec: + endpoints: + - basicAuth: + password: + key: password + name: basic-auth + username: + key: user + name: basic-auth + port: metrics +
namespaceSelector: + matchNames: + - logging + selector: + matchLabels: + app: myapp \ No newline at end of file From 33ab6917fe5bfc6e0431d9d14b264320ec8bda44 Mon Sep 17 00:00:00 2001 From: Giancarlo Rubio Date: Mon, 8 May 2017 16:53:58 +0200 Subject: [PATCH 045/638] Watch secrets --- manifests/prometheus/prometheus-k8s-rules.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 7327b0db..71f72da1 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -516,4 +516,4 @@ data: summary = "Kubelet is close to pod limit", description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", } - + From d672109608eafd0ae51ab633a1bcd745c1f289d1 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 9 May 2017 10:20:29 +0200 Subject: [PATCH 046/638] cmd/operator: rename kubelet-object flag to kubelet-service --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index dbdd32bf..8be3b967 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -16,7 +16,7 @@ spec: - name: prometheus-operator image: quay.io/coreos/prometheus-operator:v0.8.2 args: - - "--kubelet-object=kube-system/kubelet" + - "--kubelet-service=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" resources: requests: From e810357b8f24f5d00c1ea79fbe6329eb1125067a Mon Sep 17 00:00:00 2001 From: Gytis Date: Tue, 9 May 2017 12:15:59 +0300 Subject: [PATCH 047/638] Rename kube_pod_container_requested_memory_bytes -> kube_pod_container_resource_requests_memory_bytes in grafana dashboard --- 
assets/grafana/kubernetes-pods-dashboard.json | 4 ++-- manifests/grafana/grafana-dashboards.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/assets/grafana/kubernetes-pods-dashboard.json b/assets/grafana/kubernetes-pods-dashboard.json index 24036f3e..3428f36b 100644 --- a/assets/grafana/kubernetes-pods-dashboard.json +++ b/assets/grafana/kubernetes-pods-dashboard.json @@ -92,11 +92,11 @@ "step": 10 }, { - "expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", "interval": "10s", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_requested_memory_bytes", + "metric": "kube_pod_container_resource_requests_memory_bytes", "refId": "B", "step": 20 } diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 256030a3..92a88fbf 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -1782,11 +1782,11 @@ data: "step": 10 }, { - "expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", "interval": "10s", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_requested_memory_bytes", + "metric": "kube_pod_container_resource_requests_memory_bytes", "refId": "B", "step": 20 } From e5938d1205935540f09727623c8d71f7532b38bf Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 9 May 2017 12:24:00 +0200 Subject: [PATCH 048/638] *: bump Prometheus Operator tags in manifests --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml 
b/manifests/prometheus-operator/prometheus-operator.yaml index 8be3b967..24c8f86a 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.8.2 + image: quay.io/coreos/prometheus-operator:v0.9.0 args: - "--kubelet-service=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" From c4491cec42f829214ac5a13d7718e6f71c6ab7a6 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 9 May 2017 15:22:24 +0200 Subject: [PATCH 049/638] kube-prometheus: use honor_labels for kubelet config --- manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index 4e9aabd8..5729d8f0 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -15,3 +15,4 @@ spec: endpoints: - port: http-metrics interval: 15s + honorLabels: true From fa46bd8dc01f22382c1e14306db624d1f2272d96 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 9 May 2017 15:23:23 +0200 Subject: [PATCH 050/638] *: bump Alertmanager version --- manifests/alertmanager/alertmanager.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index 49f71250..f2e7372a 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -6,4 +6,4 @@ metadata: alertmanager: "main" spec: replicas: 3 - version: v0.6.1 + version: v0.6.2 From b896acf876e3d96eda0ec0d5b5418641434d1180 Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Wed, 17 May 2017 17:27:56 +0200 
Subject: [PATCH 051/638] Make kube-state-metrics single pod deployment --- manifests/kube-state-metrics/kube-state-metrics-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index bc5f0400..7d98d43e 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: name: kube-state-metrics spec: - replicas: 2 + replicas: 1 template: metadata: labels: From fbab1e4212a115b0305015792f762a8e8532edb6 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 18 May 2017 11:15:34 +0200 Subject: [PATCH 052/638] *: bump versions --- manifests/examples/example-app/prometheus-frontend.yaml | 2 +- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index e1466945..84a3238a 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,7 +6,7 @@ metadata: labels: prometheus: frontend spec: - version: v1.6.1 + version: v1.6.3 serviceMonitorSelector: matchLabels: tier: frontend diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 24c8f86a..573aaf29 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.9.0 + image: quay.io/coreos/prometheus-operator:v0.9.1 args: - 
"--kubelet-service=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 1e77d7cc..b7060ba6 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.6.1 + version: v1.6.3 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpression: From f0851d5e4da6d2174c2063d678e753fa71edaeac Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 29 May 2017 18:29:39 -0700 Subject: [PATCH 053/638] kube-prometheus: add prometheus-operator as target --- .../prometheus-operator-service.yaml | 15 +++++++++ .../prometheus-operator.yaml | 31 ++++++++++--------- ...s-service-monitor-prometheus-operator.yaml | 12 +++++++ 3 files changed, 44 insertions(+), 14 deletions(-) create mode 100644 manifests/prometheus-operator/prometheus-operator-service.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-service.yaml b/manifests/prometheus-operator/prometheus-operator-service.yaml new file mode 100644 index 00000000..8882d4a7 --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-operator + labels: + k8s-app: prometheus-operator +spec: + type: ClusterIP + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP + selector: + k8s-app: prometheus-operator diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 573aaf29..97b1cafb 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -3,25 +3,28 @@ kind: Deployment metadata: name: prometheus-operator 
labels: - operator: prometheus + k8s-app: prometheus-operator spec: replicas: 1 template: metadata: labels: - operator: prometheus + k8s-app: prometheus-operator spec: serviceAccountName: prometheus-operator containers: - - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.9.1 - args: - - "--kubelet-service=kube-system/kubelet" - - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" - resources: - requests: - cpu: 100m - memory: 50Mi - limits: - cpu: 200m - memory: 300Mi + - name: prometheus-operator + image: quay.io/coreos/prometheus-operator:v0.9.1 + args: + - "--kubelet-service=kube-system/kubelet" + - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" + ports: + - name: http + containerPort: 8080 + resources: + requests: + cpu: 100m + memory: 50Mi + limits: + cpu: 200m + memory: 300Mi diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml new file mode 100644 index 00000000..23c04073 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml @@ -0,0 +1,12 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: prometheus-operator + labels: + k8s-app: prometheus-operator +spec: + endpoints: + - port: http + selector: + matchLabels: + k8s-app: prometheus-operator From c4b382be6f542556235eb35353eb4fa0a6f1e9ab Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Sat, 27 May 2017 10:44:33 +0200 Subject: [PATCH 054/638] kube-prometheus: add alerting rules --- assets/prometheus/rules/alertmanager.rules | 36 + assets/prometheus/rules/etcd2.rules | 121 ---- assets/prometheus/rules/etcd3.rules | 177 +++++ assets/prometheus/rules/general.rules | 97 +++ assets/prometheus/rules/kube-apiserver.rules | 38 ++ .../rules/kube-controller-manager.rules | 10 + assets/prometheus/rules/kube-scheduler.rules | 10 + assets/prometheus/rules/kubelet.rules | 
61 ++ assets/prometheus/rules/kubernetes.rules | 217 ------ assets/prometheus/rules/node.rules | 10 + assets/prometheus/rules/prometheus.rules | 10 + .../prometheus/prometheus-k8s-rules.yaml | 639 +++++++++++------- 12 files changed, 828 insertions(+), 598 deletions(-) create mode 100644 assets/prometheus/rules/alertmanager.rules delete mode 100644 assets/prometheus/rules/etcd2.rules create mode 100644 assets/prometheus/rules/etcd3.rules create mode 100644 assets/prometheus/rules/general.rules create mode 100644 assets/prometheus/rules/kube-apiserver.rules create mode 100644 assets/prometheus/rules/kube-controller-manager.rules create mode 100644 assets/prometheus/rules/kube-scheduler.rules create mode 100644 assets/prometheus/rules/kubelet.rules create mode 100644 assets/prometheus/rules/node.rules create mode 100644 assets/prometheus/rules/prometheus.rules diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules new file mode 100644 index 00000000..71bdc687 --- /dev/null +++ b/assets/prometheus/rules/alertmanager.rules @@ -0,0 +1,36 @@ +ALERT AlertmanagerConfigInconsistent + IF count_values by (service) ("config_hash", alertmanager_config_hash) + / on(service) group_left + label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Alertmanager configurations are inconsistent", + description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." 
+ } + +ALERT AlertmanagerDownOrMissing + IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") + / on(job) group_right + sum by(job) (up) != 1 + FOR 5m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Alertmanager down or not discovered", + description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." + } + +ALERT FailedReload + IF alertmanager_config_last_reload_successful == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Alertmanager configuration reload has failed", + description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." + } diff --git a/assets/prometheus/rules/etcd2.rules b/assets/prometheus/rules/etcd2.rules deleted file mode 100644 index 10fa5e8d..00000000 --- a/assets/prometheus/rules/etcd2.rules +++ /dev/null @@ -1,121 +0,0 @@ -### General cluster availability ### - -# alert if another failed peer will result in an unavailable cluster -ALERT InsufficientPeers - IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Etcd cluster small", - description = "If one more etcd peer goes down the cluster will be unavailable", - } - -### HTTP requests alerts ### - -# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - -# alert if more than 5% of requests to an HTTP endpoint have 
failed with a non 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - -# alert if 50% of requests get a 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", - } - -# alert if the 99th percentile of HTTP requests take more than 150ms -ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "slow HTTP requests", - description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", - } - -### File descriptor alerts ### - -instance:fd_utilization = process_open_fds / process_max_fds - -# alert if file descriptors are likely to exhaust within the next 4 hours -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", - } - -# alert if file descriptors are likely to exhaust within the next hour -ALERT FdExhaustionClose - IF 
predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", - } - -### etcd proposal alerts ### - -# alert if there are several failed proposals within an hour -ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of failed proposals within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", - } - -### etcd disk io latency alerts ### - -# alert if 99th percentile of fsync durations is higher than 500ms -ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high fsync durations", - description = "ectd instance {{ $labels.instance }} fync durations are high", - } diff --git a/assets/prometheus/rules/etcd3.rules b/assets/prometheus/rules/etcd3.rules new file mode 100644 index 00000000..a3b2cddd --- /dev/null +++ b/assets/prometheus/rules/etcd3.rules @@ -0,0 +1,177 @@ +# general cluster availability + +# alert if another failed member will result in an unavailable cluster +ALERT InsufficientMembers +IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) +FOR 3m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "etcd cluster insufficient members", + description = "If one more etcd member goes down the cluster will be unavailable", +} + +# etcd leader alerts +# ================== + +# alert if any etcd instance has no leader +ALERT NoLeader +IF etcd_server_has_leader{job="etcd"} == 0 +FOR 1m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "etcd member has no 
leader", + description = "etcd member {{ $labels.instance }} has no leader", +} + +# alert if there are lots of leader changes +ALERT HighNumberOfLeaderChanges +IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of leader changes within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", +} + +# gRPC request alerts +# =================== + +# alert if more than 1% of gRPC method calls have failed within the last 5 minutes +ALERT HighNumberOfFailedGRPCRequests +IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if more than 5% of gRPC method calls have failed within the last 5 minutes +ALERT HighNumberOfFailedGRPCRequests +IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 +FOR 5m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if the 99th percentile of gRPC method calls take more than 150ms +ALERT GRPCRequestsSlow +IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 +FOR 10m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "slow gRPC requests", + description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", +} + +# HTTP requests alerts
+# ==================== + +# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes +ALERT HighNumberOfFailedHTTPRequests +IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes +ALERT HighNumberOfFailedHTTPRequests +IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 +FOR 5m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if the 99th percentile of HTTP requests take more than 150ms +ALERT HTTPRequestsSlow +IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "slow HTTP requests", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", +} + +# etcd member communication alerts +# ================================ + +# alert if 99th percentile of round trips take 150ms +ALERT EtcdMemberCommunicationSlow +IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "etcd member communication is slow", + description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", +} + +# etcd proposal alerts +# ==================== + +# alert if there are
several failed proposals within an hour +ALERT HighNumberOfFailedProposals +IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of proposals within the etcd cluster are failing", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", +} + +# etcd disk io latency alerts +# =========================== + +# alert if 99th percentile of fsync durations is higher than 500ms +ALERT HighFsyncDurations +IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "high fsync durations", + description = "etcd instance {{ $labels.instance }} fsync durations are high", +} + +# alert if 99th percentile of commit durations is higher than 250ms +ALERT HighCommitDurations +IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "high commit durations", + description = "etcd instance {{ $labels.instance }} commit durations are high", +} diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules new file mode 100644 index 00000000..9a8f931f --- /dev/null +++ b/assets/prometheus/rules/general.rules @@ -0,0 +1,97 @@ +### Up Alerting ### + +ALERT TargetDown + IF up == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "target is down", + description = "A target of type {{ $labels.job }} is down."
+ } + +### File descriptor alerts ### + +ALERT TooManyOpenFiles + IF 100*process_open_fds / process_max_fds > 50 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", + } + +ALERT K8STooManyOpenFiles + IF 100*process_open_fds / process_max_fds > 80 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", + } + +instance:fd_utilization = process_open_fds / process_max_fds + +# alert if file descriptors are likely to exhaust within the next 4 hours +ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", + } + +# alert if file descriptors are likely to exhaust within the next hour +ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[10m], 3600) > 1 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", + } + +### Conntrack alerts ### + +# To catch the conntrack sysctl de-tuning when it happens +ALERT ConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the
correct tunings, investigate when it happens.", + } + +ALERT ConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + +ALERT ConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules new file mode 100644 index 00000000..fadaf5b4 --- /dev/null +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -0,0 +1,38 @@ +ALERT K8SApiserverDown + IF up{job="apiserver"} == 0 + FOR 15m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "An API server could not be scraped.", + } + +# Disable for non HA kubernetes setups. +ALERT K8SApiserverDown + IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + +# Some verbs excluded because they are expected to be long-lasting: +# WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
+ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules new file mode 100644 index 00000000..f75e2768 --- /dev/null +++ b/assets/prometheus/rules/kube-controller-manager.rules @@ -0,0 +1,10 @@ +ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules new file mode 100644 index 00000000..6eff4bcd --- /dev/null +++ b/assets/prometheus/rules/kube-scheduler.rules @@ -0,0 +1,10 @@ +ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. 
New pods are not being assigned to nodes.", + } diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules new file mode 100644 index 00000000..c3cc8e9b --- /dev/null +++ b/assets/prometheus/rules/kubelet.rules @@ -0,0 +1,61 @@ +ALERT K8SNodeDown + IF up{job="kubelet"} == 0 + FOR 1h + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + +ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + +ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 20% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + +ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + +ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit
of 110", + } diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules index 157eb3fa..084d11e5 100644 --- a/assets/prometheus/rules/kubernetes.rules +++ b/assets/prometheus/rules/kubernetes.rules @@ -169,220 +169,3 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - -ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - -ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - -ALERT K8SManyNodesNotReady - IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) - / - count by (cluster) (kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", - } - -ALERT K8SKubeletNodeExporterDown - IF up{job="node-exporter"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet node_exporter cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more 
than one hour.", - } - -ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", - } - -ALERT K8SApiserverDown - IF up{job="kubernetes"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - -# Disable for non HA kubernetes setups. -ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) - FOR 5m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", - } - -ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", - } - -ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - } - -ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -# To catch the conntrack sysctl de-tuning when it happens -ALERT K8SConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 - FOR 10m - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 - FOR 10m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - -# Some verbs excluded because they are expected to be long-lasting: -# WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
-ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - -ALERT K8SApiServerEtcdAccessLatency - IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Access to etcd is slow", - description = "99th percentile latency for apiserver to access etcd is higher than 1s.", - } - -ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules new file mode 100644 index 00000000..8fd5b7d0 --- /dev/null +++ b/assets/prometheus/rules/node.rules @@ -0,0 +1,10 @@ +ALERT NodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "node-exporter cannot be scraped", + description = "Prometheus could not scrape a node-exporter for more than 10m.", + } diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules new file mode 100644 index 00000000..05c278f1 --- /dev/null +++ b/assets/prometheus/rules/prometheus.rules @@ -0,0 +1,10 @@ +ALERT FailedReload + IF prometheus_config_last_reload_successful == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Prometheus configuration reload has failed", + description = "Reloading Prometheus' configuration has failed for {{ 
$labels.namespace }}/{{ $labels.pod}}." + } diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 71f72da1..f57b6785 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -6,76 +6,259 @@ metadata: role: prometheus-rulefiles prometheus: k8s data: - etcd2.rules: |+ - ### General cluster availability ### - - # alert if another failed peer will result in an unavailable cluster - ALERT InsufficientPeers - IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Etcd cluster small", - description = "If one more etcd peer goes down the cluster will be unavailable", - } - - ### HTTP requests alerts ### - - # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 + alertmanager.rules: |+ + ALERT AlertmanagerConfigInconsistent + IF count_values by (service) ("config_hash", alertmanager_config_hash) + / on(service) group_left + label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 FOR 5m LABELS { severity = "critical" } 
ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + summary = "Alertmanager configurations are inconsistent", + description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." } - # alert if 50% of requests get a 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 - FOR 10m + ALERT AlertmanagerDownOrMissing + IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") + / on(job) group_right + sum by(job) (up) != 1 + FOR 5m LABELS { - severity = "critical" + severity = "warning" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", + summary = "Alertmanager down or not discovered", + description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." } - # alert if the 99th percentile of HTTP requests take more than 150ms - ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 + ALERT FailedReload + IF alertmanager_config_last_reload_successful == 0 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "slow HTTP requests", - description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + summary = "Alertmanager configuration reload has failed", + description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
+ } + etcd3.rules: |+ + # general cluster availability + + # alert if another failed member will result in an unavailable cluster + ALERT InsufficientMembers + IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + FOR 3m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "etcd cluster insufficient members", + description = "If one more etcd member goes down the cluster will be unavailable", + } + + # etcd leader alerts + # ================== + + # alert if any etcd instance has no leader + ALERT NoLeader + IF etcd_server_has_leader{job="etcd"} == 0 + FOR 1m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "etcd member has no leader", + description = "etcd member {{ $labels.instance }} has no leader", + } + + # alert if there are lots of leader changes + ALERT HighNumberOfLeaderChanges + IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of leader changes within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", + } + + # gRPC request alerts + # =================== + + # alert if more than 1% of gRPC method calls have failed within the last 5 minutes + ALERT HighNumberOfFailedGRPCRequests + IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if more than 5% of gRPC method calls have failed within the last 5 minutes + ALERT HighNumberOfFailedGRPCRequests + IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) 
(rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if the 99th percentile of gRPC method calls take more than 150ms + ALERT GRPCRequestsSlow + IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "slow gRPC requests", + description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow", + } + + # HTTP requests alerts + # ==================== + + # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if the 99th percentile of HTTP requests take more than 150ms + ALERT HTTPRequestsSlow + IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 + FOR 
10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "slow HTTP requests", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + } + + # etcd member communication alerts + # ================================ + + # alert if 99th percentile of round trips take 150ms + ALERT EtcdMemberCommunicationSlow + IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "etcd member communication is slow", + description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow", + } + + # etcd proposal alerts + # ==================== + + # alert if there are several failed proposals within an hour + ALERT HighNumberOfFailedProposals + IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of proposals within the etcd cluster are failing", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + } + + # etcd disk io latency alerts + # =========================== + + # alert if 99th percentile of fsync durations is higher than 500ms + ALERT HighFsyncDurations + IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high fsync durations", + description = "etcd instance {{ $labels.instance }} fync durations are high", + } + + # alert if 99th percentile of commit durations is higher than 250ms + ALERT HighCommitDurations + IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high commit durations", + description = "etcd instance {{ $labels.instance }} commit durations are high", + } + 
general.rules: |+ + ### Up Alerting ### + + Alert TargetDown + IF up == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "target is down", + description = "A target of type {{ $labels.job }} is down." } ### File descriptor alerts ### + ALERT TooManyOpenFiles + IF 100*process_open_fds / process_max_fds > 50 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", + } + + ALERT K8STooManyOpenFiles + IF 100*process_open_fds / process_max_fds > 80 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", + } + instance:fd_utilization = process_open_fds / process_max_fds # alert if file descriptors are likely to exhaust within the next 4 hours @@ -87,7 +270,7 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", } # alert if file descriptors are likely to exhaust within the next hour @@ -99,34 +282,154 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", } - ### etcd proposal alerts ### + ### Contrack alerts ### - # alert if there are several failed proposals within an hour - 
ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + # To catch the conntrack sysctl de-tuning when it happens + ALERT ConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m LABELS { - severity = "warning" + severity = "warning", } ANNOTATIONS { - summary = "a high number of failed proposals within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the correct tunings, investigate when it happens.", } - ### etcd disk io latency alerts ### - - # alert if 99th percentile of fsync durations is higher than 500ms - ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 + ALERT ConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "high fsync durations", - description = "ectd instance {{ $labels.instance }} fync durations are high", + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + + ALERT ConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + kube-apiserver.rules: |+ + ALERT K8SApiserverDown + IF up{job="apiserver"} == 0 + FOR 15m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "An API server could not be scraped.", + } + + # Disable for non HA kubernetes setups. 
+ ALERT K8SApiserverDown + IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + + # Some verbs excluded because they are expected to be long-lasting: + # WATCHLIST is long-poll, CONNECT is `kubectl exec`. + ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + kube-controller-manager.rules: |+ + ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", + } + kubelet.rules: |+ + ALERT K8SNodeDown + IF up{job="kubelet"} == 0 + FOR 1h + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + + ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + + ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + + ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + + ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", } kubernetes.rules: |+ # NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
@@ -300,220 +603,36 @@ data: histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - - ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - - ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - - ALERT K8SManyNodesNotReady - IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) - / - count by (cluster) (kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", - } - - ALERT K8SKubeletNodeExporterDown - IF up{job="node-exporter"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet node_exporter cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", - } - - ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Many Kubelets cannot be 
scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", - } - - ALERT K8SApiserverDown - IF up{job="kubernetes"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - - # Disable for non HA kubernetes setups. - ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) - FOR 5m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", - } - + kube-scheduler.rules: |+ ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m LABELS { - service = "k8s", severity = "critical", } ANNOTATIONS { summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", } - - ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - } - - ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + node.rules: |+ + ALERT NodeExporterDown + IF up{job="node-exporter"} == 0 FOR 10m LABELS { - service = "k8s", severity = "warning" } ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", + summary = "node-exporter cannot be scraped", + description = "Prometheus could not scrape a node-exporter for more than 10m.", } - - ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - - # To catch the conntrack sysctl de-tuning when it happens - ALERT K8SConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 + prometheus.rules: |+ + ALERT FailedReload + IF prometheus_config_last_reload_successful == 0 FOR 10m LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 - FOR 10m - LABELS { - service = "k8s", severity = "warning" } ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + summary = "Prometheus configuration reload has failed", + description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
} - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 - FOR 10m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - - # Some verbs excluded because they are expected to be long-lasting: - # WATCHLIST is long-poll, CONNECT is `kubectl exec`. - ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - - ALERT K8SApiServerEtcdAccessLatency - IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Access to etcd is slow", - description = "99th percentile latency for apiserver to access etcd is higher than 1s.", - } - - ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - From 3aeca3de7ba242e68c94db6b263091ff79a9a7ef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 30 May 2017 16:48:52 -0700 Subject: [PATCH 055/638] kube-prometheus: simplify service monitors --- ...heus-k8s-service-monitor-alertmanager.yaml | 14 +++++++---- ...metheus-k8s-service-monitor-apiserver.yaml | 6 ++--- ...eus-k8s-service-monitor-k8s-apps-http.yaml | 23 ------------------- ...rvice-monitor-kube-controller-manager.yaml | 17 
++++++++++++++ ...us-k8s-service-monitor-kube-scheduler.yaml | 17 ++++++++++++++ ...8s-service-monitor-kube-state-metrics.yaml | 5 ++-- ...rometheus-k8s-service-monitor-kubelet.yaml | 10 ++++---- ...eus-k8s-service-monitor-node-exporter.yaml | 5 ++-- ...etheus-k8s-service-monitor-prometheus.yaml | 12 ++++++---- 9 files changed, 63 insertions(+), 46 deletions(-) delete mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml index d193b676..29d68c82 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml @@ -1,12 +1,16 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: - labels: - alertmanager: main name: alertmanager + labels: + app: alertmanager spec: + selector: + matchLabels: + alertmanager: main + namespaceSelector: + matchNames: + - monitoring endpoints: - port: web - selector: - matchExpressions: - - {key: alertmanager, operator: In, values: [main]} + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml index 1fd793e5..09a87c2e 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml @@ -3,9 +3,9 @@ kind: ServiceMonitor metadata: name: kube-apiserver labels: - k8s-apps: https + k8s-app: apiserver spec: - jobLabel: provider + jobLabel: component selector: matchLabels: component: apiserver @@ -15,7 +15,7 @@ spec: - default endpoints: - port: https - interval: 15s + 
interval: 30s scheme: https tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml deleted file mode 100644 index fbfcda97..00000000 --- a/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ServiceMonitor -metadata: - name: k8s-apps-http - namespace: monitoring - labels: - k8s-apps: http -spec: - jobLabel: k8s-app - selector: - matchExpressions: - - {key: k8s-app, operator: Exists} - - {key: k8s-app, operator: NotIn, values: [kubelet]} - namespaceSelector: - matchNames: - - kube-system - endpoints: - - port: http-metrics - interval: 15s - - port: http-metrics-dnsmasq - interval: 15s - - port: http-metrics-skydns - interval: 15s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml new file mode 100644 index 00000000..eef95a84 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-controller-manager + labels: + k8s-app: kube-controller-manager +spec: + jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + selector: + matchLabels: + k8s-app: kube-controller-manager + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml new file mode 100644 index 00000000..663f8cfb --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: 
kube-scheduler + labels: + k8s-app: kube-scheduler +spec: + jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + selector: + matchLabels: + k8s-app: kube-scheduler + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml index c4ed1afc..a276702a 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml @@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: name: kube-state-metrics - namespace: monitoring labels: - k8s-apps: http + k8s-app: kube-state-metrics spec: jobLabel: k8s-app selector: @@ -15,5 +14,5 @@ spec: - monitoring endpoints: - port: http-metrics - interval: 15s + interval: 30s honorLabels: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index 5729d8f0..cdc3ffb6 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -3,16 +3,16 @@ kind: ServiceMonitor metadata: name: kubelet labels: - k8s-apps: http + k8s-app: kubelet spec: jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + honorLabels: true selector: matchLabels: k8s-app: kubelet namespaceSelector: matchNames: - kube-system - endpoints: - - port: http-metrics - interval: 15s - honorLabels: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml index a7b20301..b68ed89f 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml @@ -2,9 +2,8 @@ apiVersion: 
monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: name: node-exporter - namespace: monitoring labels: - k8s-apps: http + k8s-app: node-exporter spec: jobLabel: k8s-app selector: @@ -15,4 +14,4 @@ spec: - monitoring endpoints: - port: http-metrics - interval: 15s + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml index 5e5d17be..be74cd6d 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml @@ -3,10 +3,14 @@ kind: ServiceMonitor metadata: name: prometheus labels: - prometheus: k8s + app: prometheus spec: + selector: + matchLabels: + prometheus: k8s + namespaceSelector: + matchNames: + - monitoring endpoints: - port: web - selector: - matchExpressions: - - {key: prometheus, operator: In, values: [k8s]} + interval: 30s From 804f6c187bcf99434490e13b930fee7fa5d61a1c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 30 May 2017 17:07:13 -0700 Subject: [PATCH 056/638] kube-prometheus: add dead man's switch --- assets/alertmanager/alertmanager.yaml | 10 ++++++---- assets/prometheus/rules/general.rules | 12 ++++++++++++ manifests/alertmanager/alertmanager-config.yaml | 2 +- manifests/prometheus/prometheus-k8s-rules.yaml | 12 ++++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/assets/alertmanager/alertmanager.yaml b/assets/alertmanager/alertmanager.yaml index f08a2106..6b5789b5 100644 --- a/assets/alertmanager/alertmanager.yaml +++ b/assets/alertmanager/alertmanager.yaml @@ -5,8 +5,10 @@ route: group_wait: 30s group_interval: 5m repeat_interval: 12h - receiver: 'webhook' + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' receivers: -- name: 'webhook' - webhook_configs: - - url: 'http://alertmanagerwh:30500/' +- name: 'null' diff --git a/assets/prometheus/rules/general.rules 
b/assets/prometheus/rules/general.rules index 9a8f931f..ac782297 100644 --- a/assets/prometheus/rules/general.rules +++ b/assets/prometheus/rules/general.rules @@ -11,6 +11,18 @@ Alert TargetDown description = "A target of type {{ $labels.job }} is down." } +### Dead man's switch ### + +ALERT DeadMansSwitch + IF vector(1) + LABELS { + severity = "none", + } + ANNOTATIONS { + summary = "Alerting DeadMansSwitch", + description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", + } + ### File descriptor alerts ### ALERT TooManyOpenFiles diff --git a/manifests/alertmanager/alertmanager-config.yaml b/manifests/alertmanager/alertmanager-config.yaml index eee36b33..62d39016 100644 --- a/manifests/alertmanager/alertmanager-config.yaml +++ b/manifests/alertmanager/alertmanager-config.yaml @@ -3,4 +3,4 @@ kind: Secret metadata: name: alertmanager-main data: - alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg== + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index f57b6785..c157909f 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -235,6 +235,18 @@ data: description = "A target of type {{ $labels.job }} is down." 
} + ### Dead man's switch ### + + ALERT DeadMansSwitch + IF vector(1) + LABELS { + severity = "none", + } + ANNOTATIONS { + summary = "Alerting DeadMansSwitch", + description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", + } + ### File descriptor alerts ### ALERT TooManyOpenFiles From 666d7aaa0e6378a6779a4f4089be146c9be057b5 Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 31 May 2017 10:30:29 +0200 Subject: [PATCH 057/638] namespace has to be kube-system In order for this service to be discovered by ServiceMonitor k8s-apps-http, it should belong to namespace kube-system. --- manifests/k8s/minikube/kube-scheduler.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/k8s/minikube/kube-scheduler.yaml b/manifests/k8s/minikube/kube-scheduler.yaml index b3b51f38..9815a6da 100644 --- a/manifests/k8s/minikube/kube-scheduler.yaml +++ b/manifests/k8s/minikube/kube-scheduler.yaml @@ -1,6 +1,7 @@ apiVersion: v1 kind: Service metadata: + namespace: kube-system name: kube-scheduler-prometheus-discovery labels: k8s-app: kube-scheduler From 2e67b4084ffb5d5c9843f691bc315cdca424458e Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 31 May 2017 10:33:32 +0200 Subject: [PATCH 058/638] namespace has to be kube-system --- manifests/k8s/minikube/kube-scheduler.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/k8s/minikube/kube-scheduler.yaml b/manifests/k8s/minikube/kube-scheduler.yaml index 9815a6da..8599575c 100644 --- a/manifests/k8s/minikube/kube-scheduler.yaml +++ b/manifests/k8s/minikube/kube-scheduler.yaml @@ -17,6 +17,7 @@ spec: apiVersion: v1 kind: Endpoints metadata: + namespace: kube-system name: kube-scheduler-prometheus-discovery labels: k8s-app: kube-scheduler From 06cff8e9d2ad4923fb7d8c8c6245dc4807d9d4c5 Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 31 May 2017 10:35:22 +0200 Subject: [PATCH 059/638] namespace has to be kube-system for service discovery to work within prometheus configuration 
(ServiceMonitor k8s-app) these service and endpoints should belong to kube-system namespace. Otherwise the service monitor won't fetch it. --- manifests/k8s/minikube/kube-controller-manager.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/manifests/k8s/minikube/kube-controller-manager.yaml b/manifests/k8s/minikube/kube-controller-manager.yaml index 135dd24c..d33015aa 100644 --- a/manifests/k8s/minikube/kube-controller-manager.yaml +++ b/manifests/k8s/minikube/kube-controller-manager.yaml @@ -1,6 +1,7 @@ apiVersion: v1 kind: Service metadata: + namespace: kube-system name: kube-controller-manager-prometheus-discovery labels: k8s-app: kube-controller-manager @@ -16,6 +17,7 @@ spec: apiVersion: v1 kind: Endpoints metadata: + namespace: kube-system name: kube-controller-manager-prometheus-discovery labels: k8s-app: kube-controller-manager From 04231a269cedb943c69b9e486cf87de01e86f58b Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 31 May 2017 11:29:42 +0200 Subject: [PATCH 060/638] selector was incorrect selector prometheus: prometheus-frontend is incorrect, the name of the Prometheus resource in the example is "frontend", not "prometheus-frontend" --- manifests/examples/example-app/prometheus-frontend-svc.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/examples/example-app/prometheus-frontend-svc.yaml b/manifests/examples/example-app/prometheus-frontend-svc.yaml index 9b8ecbb4..6a269155 100644 --- a/manifests/examples/example-app/prometheus-frontend-svc.yaml +++ b/manifests/examples/example-app/prometheus-frontend-svc.yaml @@ -11,4 +11,4 @@ spec: protocol: TCP targetPort: web selector: - prometheus: prometheus-frontend + prometheus: frontend From 30cbd769440b3cc635e78fadb6da8615cd9ca8b4 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 31 May 2017 06:39:35 -0700 Subject: [PATCH 061/638] kube-prometheus: add PROXY verb to latency alert exclusion --- assets/prometheus/rules/kube-apiserver.rules | 2 +- 
manifests/prometheus/prometheus-k8s-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules index fadaf5b4..6c58fe52 100644 --- a/assets/prometheus/rules/kube-apiserver.rules +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -26,7 +26,7 @@ ALERT K8SApiserverDown ALERT K8SApiServerLatency IF histogram_quantile( 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) ) / 1e6 > 1.0 FOR 10m LABELS { diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index c157909f..23638f35 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -360,7 +360,7 @@ data: ALERT K8SApiServerLatency IF histogram_quantile( 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) ) / 1e6 > 1.0 FOR 10m LABELS { From 3238ba257ada3116c0784ba299e1c9d2ff2c4103 Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 31 May 2017 20:55:56 +0200 Subject: [PATCH 062/638] small changes in k8s/minikube and example-app (#392) * namespace has to be kube-system In order for this service to be discovered by ServiceMonitor k8s-apps-http, it should belong to namespace kube-system. * namespace has to be kube-system * namespace has to be kube-system for service discovery to work within prometheus configuration (ServiceMonitor k8s-app) these service and endpoints should belong to kube-system namespace. Otherwise the service monitor won't fetch it. 
* selector was incorrect selector prometheus: prometheus-frontend is incorrect, the name of the Prometheus resource in the example is "frontend", not "prometheus-frontend" --- manifests/examples/example-app/prometheus-frontend-svc.yaml | 2 +- manifests/k8s/minikube/kube-controller-manager.yaml | 2 ++ manifests/k8s/minikube/kube-scheduler.yaml | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/manifests/examples/example-app/prometheus-frontend-svc.yaml b/manifests/examples/example-app/prometheus-frontend-svc.yaml index 9b8ecbb4..6a269155 100644 --- a/manifests/examples/example-app/prometheus-frontend-svc.yaml +++ b/manifests/examples/example-app/prometheus-frontend-svc.yaml @@ -11,4 +11,4 @@ spec: protocol: TCP targetPort: web selector: - prometheus: prometheus-frontend + prometheus: frontend diff --git a/manifests/k8s/minikube/kube-controller-manager.yaml b/manifests/k8s/minikube/kube-controller-manager.yaml index 135dd24c..d33015aa 100644 --- a/manifests/k8s/minikube/kube-controller-manager.yaml +++ b/manifests/k8s/minikube/kube-controller-manager.yaml @@ -1,6 +1,7 @@ apiVersion: v1 kind: Service metadata: + namespace: kube-system name: kube-controller-manager-prometheus-discovery labels: k8s-app: kube-controller-manager @@ -16,6 +17,7 @@ spec: apiVersion: v1 kind: Endpoints metadata: + namespace: kube-system name: kube-controller-manager-prometheus-discovery labels: k8s-app: kube-controller-manager diff --git a/manifests/k8s/minikube/kube-scheduler.yaml b/manifests/k8s/minikube/kube-scheduler.yaml index b3b51f38..8599575c 100644 --- a/manifests/k8s/minikube/kube-scheduler.yaml +++ b/manifests/k8s/minikube/kube-scheduler.yaml @@ -1,6 +1,7 @@ apiVersion: v1 kind: Service metadata: + namespace: kube-system name: kube-scheduler-prometheus-discovery labels: k8s-app: kube-scheduler @@ -16,6 +17,7 @@ spec: apiVersion: v1 kind: Endpoints metadata: + namespace: kube-system name: kube-scheduler-prometheus-discovery labels: k8s-app: kube-scheduler From 
0c35d73e2ccec5941ec49728c8875bcf5926efa8 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 6 Jun 2017 15:22:28 +0200 Subject: [PATCH 063/638] kube-prometheus: drop conntrack alerts and direct up alerts --- assets/prometheus/rules/general.rules | 62 +++---------------- assets/prometheus/rules/kube-apiserver.rules | 14 +---- assets/prometheus/rules/kubelet.rules | 29 +++++---- .../prometheus/prometheus-k8s-rules.yaml | 1 + 4 files changed, 24 insertions(+), 82 deletions(-) diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules index ac782297..9e26ab9a 100644 --- a/assets/prometheus/rules/general.rules +++ b/assets/prometheus/rules/general.rules @@ -1,14 +1,14 @@ ### Up Alerting ### Alert TargetDown - IF up == 0 + IF 100 * (count(up == 0) / count(up)) > 3 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "target is down", - description = "A target of type {{ $labels.job }} is down." + summary = "Targets are down", + description = "More than {{ $value }}% of targets are down." 
} ### Dead man's switch ### @@ -25,26 +25,15 @@ ALERT DeadMansSwitch ### File descriptor alerts ### -ALERT TooManyOpenFiles - IF 100*process_open_fds / process_max_fds > 50 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds / process_max_fds > 80 +ALERT TooManyOpenFileDescriptors + IF 100 * (process_open_fds / process_max_fds) > 95 FOR 10m LABELS { severity = "critical" } ANNOTATIONS { summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", } instance:fd_utilization = process_open_fds / process_max_fds @@ -58,7 +47,7 @@ ALERT FdExhaustionClose } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", } # alert if file descriptors are likely to exhaust within the next hour @@ -70,40 +59,5 @@ ALERT FdExhaustionClose } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", - } - -### Contrack alerts ### - -# To catch the conntrack sysctl de-tuning when it happens -ALERT ConntrackTuningMissing - IF 
node_nf_conntrack_udp_timeout > 10 - FOR 10m - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - -ALERT ConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -ALERT ConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", } diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules index 6c58fe52..8d8d1392 100644 --- a/assets/prometheus/rules/kube-apiserver.rules +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -1,15 +1,3 @@ -ALERT K8SApiserverDown - IF up{job="apiserver"} == 0 - FOR 15m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - -# Disable for non HA kubernetes setups. 
ALERT K8SApiserverDown IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) FOR 5m @@ -18,7 +6,7 @@ ALERT K8SApiserverDown } ANNOTATIONS { summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", } # Some verbs excluded because they are expected to be long-lasting: diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules index c3cc8e9b..cbcd576c 100644 --- a/assets/prometheus/rules/kubelet.rules +++ b/assets/prometheus/rules/kubelet.rules @@ -1,14 +1,3 @@ -ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - ALERT K8SNodeNotReady IF kube_node_status_ready{condition="true"} == 0 FOR 1h @@ -39,15 +28,25 @@ ALERT K8SManyNodesNotReady } ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 FOR 1h LABELS { - service = "k8s", - severity = "critical" + severity = "warning", } ANNOTATIONS { summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + description = "Prometheus failed to scrape {{ $value }}% of kubelets.", + } + +ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many 
Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", } ALERT K8SKubeletTooManyPods diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 23638f35..14284560 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -625,6 +625,7 @@ data: ANNOTATIONS { summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + runbook = "https://github.com/coreos/tectonic-installer/blob/master/Documentation/troubleshooting/controller-recovery.md#disaster-recovery-of-scheduler-and-controller-manager-pods" } node.rules: |+ ALERT NodeExporterDown From 4da7a872ba971b1c1505da2f4ae19883d19c0740 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 6 Jun 2017 15:34:10 +0200 Subject: [PATCH 064/638] kube-prometheus: add comment on apiserver latency unit --- assets/prometheus/rules/kube-apiserver.rules | 2 ++ 1 file changed, 2 insertions(+) diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules index 8d8d1392..c041881a 100644 --- a/assets/prometheus/rules/kube-apiserver.rules +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -11,6 +11,8 @@ ALERT K8SApiserverDown # Some verbs excluded because they are expected to be long-lasting: # WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
+# +# apiserver_request_latencies' unit is microseconds ALERT K8SApiServerLatency IF histogram_quantile( 0.99, From 1a457371bce624790e267a29294c24643be60fef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 6 Jun 2017 16:00:17 +0200 Subject: [PATCH 065/638] kube-prometheus: regenerate the rule file configmap manifest --- .../prometheus/prometheus-k8s-rules.yaml | 108 ++++-------------- 1 file changed, 25 insertions(+), 83 deletions(-) diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 14284560..cb062db1 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -225,14 +225,14 @@ data: ### Up Alerting ### Alert TargetDown - IF up == 0 + IF 100 * (count(up == 0) / count(up)) > 3 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "target is down", - description = "A target of type {{ $labels.job }} is down." + summary = "Targets are down", + description = "More than {{ $value }}% of targets are down." 
} ### Dead man's switch ### @@ -249,26 +249,15 @@ data: ### File descriptor alerts ### - ALERT TooManyOpenFiles - IF 100*process_open_fds / process_max_fds > 50 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", - } - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds / process_max_fds > 80 + ALERT TooManyOpenFileDescriptors + IF 100 * (process_open_fds / process_max_fds) > 95 FOR 10m LABELS { severity = "critical" } ANNOTATIONS { summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", } instance:fd_utilization = process_open_fds / process_max_fds @@ -282,7 +271,7 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", } # alert if file descriptors are likely to exhaust within the next hour @@ -294,56 +283,9 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", - } - - ### Contrack alerts ### - - # To catch the conntrack sysctl de-tuning when it happens - ALERT ConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 - FOR 10m - LABELS { - 
severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - - ALERT ConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - - ALERT ConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", } kube-apiserver.rules: |+ - ALERT K8SApiserverDown - IF up{job="apiserver"} == 0 - FOR 15m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - - # Disable for non HA kubernetes setups. ALERT K8SApiserverDown IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) FOR 5m @@ -352,11 +294,13 @@ data: } ANNOTATIONS { summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", } # Some verbs excluded because they are expected to be long-lasting: # WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
+ # + # apiserver_request_latencies' unit is microseconds ALERT K8SApiServerLatency IF histogram_quantile( 0.99, @@ -382,17 +326,6 @@ data: description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", } kubelet.rules: |+ - ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - ALERT K8SNodeNotReady IF kube_node_status_ready{condition="true"} == 0 FOR 1h @@ -423,15 +356,25 @@ data: } ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 FOR 1h LABELS { - service = "k8s", - severity = "critical" + severity = "warning", } ANNOTATIONS { summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + description = "Prometheus failed to scrape {{ $value }}% of kubelets.", + } + + ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", } ALERT K8SKubeletTooManyPods @@ -625,7 +568,6 @@ data: ANNOTATIONS { summary = "Scheduler is down", description = "There is no running K8S scheduler. 
New pods are not being assigned to nodes.", - runbook = "https://github.com/coreos/tectonic-installer/blob/master/Documentation/troubleshooting/controller-recovery.md#disaster-recovery-of-scheduler-and-controller-manager-pods" } node.rules: |+ ALERT NodeExporterDown From 373e5cf0965754c2c4eb807d69296ca04bc66651 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 7 Jun 2017 13:17:11 +0200 Subject: [PATCH 066/638] *: use Prometheus 1.7 --- manifests/examples/example-app/prometheus-frontend.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index 84a3238a..c092d8e2 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,7 +6,7 @@ metadata: labels: prometheus: frontend spec: - version: v1.6.3 + version: v1.7.0 serviceMonitorSelector: matchLabels: tier: frontend diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index b7060ba6..63e9c3f7 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.6.3 + version: v1.7.0 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpression: From 67de2c17fe5aae478a3e7d8033712c8a788fa003 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 23 May 2017 12:35:02 +0200 Subject: [PATCH 067/638] *: default -route-prefix to root Whenever serving Prometheus or Alertmanager through non root ExternalURLs we require the reverse proxy to trim the URL to be to the root Prometheus/Alertmanager. 
--- manifests/alertmanager/alertmanager.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index f2e7372a..ebbfeddd 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -6,4 +6,4 @@ metadata: alertmanager: "main" spec: replicas: 3 - version: v0.6.2 + version: v0.7.0 From 77adbc079e98edf9e03d8d79a966e1a964ac7d12 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 9 Jun 2017 10:35:52 +0200 Subject: [PATCH 068/638] cut v0.10.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 97b1cafb..8b15d7d2 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.9.1 + image: quay.io/coreos/prometheus-operator:v0.10.0 args: - "--kubelet-service=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" From 74cc46d984a8750beaf82e8bf6207915b9d02667 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 12 Jun 2017 14:42:18 +0200 Subject: [PATCH 069/638] *: update default versions and example usages --- manifests/alertmanager/alertmanager.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index ebbfeddd..91f46bb4 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -6,4 +6,4 @@ metadata: alertmanager: "main" spec: replicas: 3 - version: v0.7.0 + version: v0.7.1 From 9b1f9b6c477e7f86b6c618f64da3718cf3d1041b Mon Sep 17 00:00:00 
2001 From: Giancarlo Rubio Date: Mon, 12 Jun 2017 10:13:39 +0200 Subject: [PATCH 070/638] Fix basicAuth, swapping key and name --- manifests/examples/basic-auth/service-monitor.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/manifests/examples/basic-auth/service-monitor.yaml b/manifests/examples/basic-auth/service-monitor.yaml index 8d9549f0..52428b1d 100644 --- a/manifests/examples/basic-auth/service-monitor.yaml +++ b/manifests/examples/basic-auth/service-monitor.yaml @@ -8,11 +8,11 @@ spec: endpoints: - basicAuth: password: - key: basic-auth - name: password + name: basic-auth + key: password username: - key: basic-auth - name: user + name: basic-auth + key: user port: metrics namespaceSelector: matchNames: From 98cdf68a0cdbfa87da227a71a282ad3471f6d0cb Mon Sep 17 00:00:00 2001 From: chenxingyu Date: Tue, 13 Jun 2017 16:40:56 +0800 Subject: [PATCH 071/638] fix alert rule bug --- assets/prometheus/rules/etcd3.rules | 6 +++--- manifests/prometheus/prometheus-k8s-rules.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/assets/prometheus/rules/etcd3.rules b/assets/prometheus/rules/etcd3.rules index a3b2cddd..1b1621e4 100644 --- a/assets/prometheus/rules/etcd3.rules +++ b/assets/prometheus/rules/etcd3.rules @@ -76,7 +76,7 @@ LABELS { } ANNOTATIONS { summary = "slow gRPC requests", - description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow", + description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", } # HTTP requests alerts @@ -117,7 +117,7 @@ LABELS { } ANNOTATIONS { summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", } # etcd member communication alerts @@ -132,7 +132,7 @@ LABELS { } ANNOTATIONS { summary = "etcd member 
communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow", + description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", } # etcd proposal alerts diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index cb062db1..f27cf33b 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -122,7 +122,7 @@ data: } ANNOTATIONS { summary = "slow gRPC requests", - description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow", + description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", } # HTTP requests alerts @@ -163,7 +163,7 @@ data: } ANNOTATIONS { summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", } # etcd member communication alerts @@ -178,7 +178,7 @@ data: } ANNOTATIONS { summary = "etcd member communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow", + description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", } # etcd proposal alerts From e1cda3fa7ddd27869c460af263b0e5fafb587996 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 13 Jun 2017 11:21:02 +0200 Subject: [PATCH 072/638] cut v0.10.1 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 8b15d7d2..d574b89f 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ 
b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.10.0 + image: quay.io/coreos/prometheus-operator:v0.10.1 args: - "--kubelet-service=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" From a1afce8707f1c14871147e2dd398551a0040f72c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 15 Jun 2017 09:34:59 +0200 Subject: [PATCH 073/638] alerting rules: replace severity with action --- assets/prometheus/rules/alertmanager.rules | 6 +++--- assets/prometheus/rules/general.rules | 11 ++++------- assets/prometheus/rules/kube-apiserver.rules | 4 ++-- assets/prometheus/rules/kube-controller-manager.rules | 2 +- assets/prometheus/rules/kube-scheduler.rules | 2 +- assets/prometheus/rules/kubelet.rules | 10 +++++----- assets/prometheus/rules/node.rules | 2 +- assets/prometheus/rules/prometheus.rules | 2 +- 8 files changed, 18 insertions(+), 21 deletions(-) diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules index 71bdc687..30a70ee3 100644 --- a/assets/prometheus/rules/alertmanager.rules +++ b/assets/prometheus/rules/alertmanager.rules @@ -4,7 +4,7 @@ ALERT AlertmanagerConfigInconsistent label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 FOR 5m LABELS { - severity = "critical" + severity = "page" } ANNOTATIONS { summary = "Alertmanager configurations are inconsistent", @@ -17,7 +17,7 @@ ALERT AlertmanagerDownOrMissing sum by(job) (up) != 1 FOR 5m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Alertmanager down or not discovered", @@ -28,7 +28,7 @@ ALERT FailedReload IF alertmanager_config_last_reload_successful == 0 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Alertmanager configuration 
reload has failed", diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules index 9e26ab9a..7b406f07 100644 --- a/assets/prometheus/rules/general.rules +++ b/assets/prometheus/rules/general.rules @@ -4,7 +4,7 @@ Alert TargetDown IF 100 * (count(up == 0) / count(up)) > 3 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Targets are down", @@ -15,9 +15,6 @@ Alert TargetDown ALERT DeadMansSwitch IF vector(1) - LABELS { - severity = "none", - } ANNOTATIONS { summary = "Alerting DeadMansSwitch", description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", @@ -29,7 +26,7 @@ ALERT TooManyOpenFileDescriptors IF 100 * (process_open_fds / process_max_fds) > 95 FOR 10m LABELS { - severity = "critical" + severity = "page" } ANNOTATIONS { summary = "too many open file descriptors", @@ -43,7 +40,7 @@ ALERT FdExhaustionClose IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "file descriptors soon exhausted", @@ -55,7 +52,7 @@ ALERT FdExhaustionClose IF predict_linear(instance:fd_utilization[10m], 3600) > 1 FOR 10m LABELS { - severity = "critical" + severity = "page" } ANNOTATIONS { summary = "file descriptors soon exhausted", diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules index c041881a..be6dc97f 100644 --- a/assets/prometheus/rules/kube-apiserver.rules +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -2,7 +2,7 @@ ALERT K8SApiserverDown IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) FOR 5m LABELS { - severity = "critical" + severity = "page" } ANNOTATIONS { summary = "API server unreachable", @@ -20,7 +20,7 @@ ALERT K8SApiServerLatency ) / 1e6 > 1.0 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { 
summary = "Kubernetes apiserver latency is high", diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules index f75e2768..90546273 100644 --- a/assets/prometheus/rules/kube-controller-manager.rules +++ b/assets/prometheus/rules/kube-controller-manager.rules @@ -2,7 +2,7 @@ ALERT K8SControllerManagerDown IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) FOR 5m LABELS { - severity = "critical", + severity = "page", } ANNOTATIONS { summary = "Controller manager is down", diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules index 6eff4bcd..80e954dd 100644 --- a/assets/prometheus/rules/kube-scheduler.rules +++ b/assets/prometheus/rules/kube-scheduler.rules @@ -2,7 +2,7 @@ ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m LABELS { - severity = "critical", + severity = "page", } ANNOTATIONS { summary = "Scheduler is down", diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules index cbcd576c..124d8dd0 100644 --- a/assets/prometheus/rules/kubelet.rules +++ b/assets/prometheus/rules/kubelet.rules @@ -2,7 +2,7 @@ ALERT K8SNodeNotReady IF kube_node_status_ready{condition="true"} == 0 FOR 1h LABELS { - severity = "warning", + severity = "ticket", } ANNOTATIONS { summary = "Node status is NotReady", @@ -20,7 +20,7 @@ ALERT K8SManyNodesNotReady ) > 0.2 FOR 1m LABELS { - severity = "critical", + severity = "page", } ANNOTATIONS { summary = "Many K8s nodes are Not Ready", @@ -31,7 +31,7 @@ ALERT K8SKubeletDown IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 FOR 1h LABELS { - severity = "warning", + severity = "ticket", } ANNOTATIONS { summary = "Many Kubelets cannot be scraped", @@ -42,7 +42,7 @@ ALERT K8SKubeletDown IF absent(up{job="kubelet"}) 
or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h LABELS { - severity = "critical", + severity = "page", } ANNOTATIONS { summary = "Many Kubelets cannot be scraped", @@ -52,7 +52,7 @@ ALERT K8SKubeletDown ALERT K8SKubeletTooManyPods IF kubelet_running_pod_count > 100 LABELS { - severity = "warning", + severity = "ticket", } ANNOTATIONS { summary = "Kubelet is close to pod limit", diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules index 8fd5b7d0..9844947a 100644 --- a/assets/prometheus/rules/node.rules +++ b/assets/prometheus/rules/node.rules @@ -2,7 +2,7 @@ ALERT NodeExporterDown IF up{job="node-exporter"} == 0 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "node-exporter cannot be scraped", diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules index 05c278f1..c29ed6ca 100644 --- a/assets/prometheus/rules/prometheus.rules +++ b/assets/prometheus/rules/prometheus.rules @@ -2,7 +2,7 @@ ALERT FailedReload IF prometheus_config_last_reload_successful == 0 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Prometheus configuration reload has failed", From 915677eaa20321c4a8c9d39e699dc846726d3c50 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 15 Jun 2017 10:45:51 +0200 Subject: [PATCH 074/638] Revert "alerting rules: replace severity with action" --- assets/prometheus/rules/alertmanager.rules | 6 +++--- assets/prometheus/rules/general.rules | 11 +++++++---- assets/prometheus/rules/kube-apiserver.rules | 4 ++-- assets/prometheus/rules/kube-controller-manager.rules | 2 +- assets/prometheus/rules/kube-scheduler.rules | 2 +- assets/prometheus/rules/kubelet.rules | 10 +++++----- assets/prometheus/rules/node.rules | 2 +- assets/prometheus/rules/prometheus.rules | 2 +- 8 files changed, 21 insertions(+), 18 deletions(-) diff --git 
a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules index 30a70ee3..71bdc687 100644 --- a/assets/prometheus/rules/alertmanager.rules +++ b/assets/prometheus/rules/alertmanager.rules @@ -4,7 +4,7 @@ ALERT AlertmanagerConfigInconsistent label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 FOR 5m LABELS { - severity = "page" + severity = "critical" } ANNOTATIONS { summary = "Alertmanager configurations are inconsistent", @@ -17,7 +17,7 @@ ALERT AlertmanagerDownOrMissing sum by(job) (up) != 1 FOR 5m LABELS { - severity = "ticket" + severity = "warning" } ANNOTATIONS { summary = "Alertmanager down or not discovered", @@ -28,7 +28,7 @@ ALERT FailedReload IF alertmanager_config_last_reload_successful == 0 FOR 10m LABELS { - severity = "ticket" + severity = "warning" } ANNOTATIONS { summary = "Alertmanager configuration reload has failed", diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules index 7b406f07..9e26ab9a 100644 --- a/assets/prometheus/rules/general.rules +++ b/assets/prometheus/rules/general.rules @@ -4,7 +4,7 @@ Alert TargetDown IF 100 * (count(up == 0) / count(up)) > 3 FOR 10m LABELS { - severity = "ticket" + severity = "warning" } ANNOTATIONS { summary = "Targets are down", @@ -15,6 +15,9 @@ Alert TargetDown ALERT DeadMansSwitch IF vector(1) + LABELS { + severity = "none", + } ANNOTATIONS { summary = "Alerting DeadMansSwitch", description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", @@ -26,7 +29,7 @@ ALERT TooManyOpenFileDescriptors IF 100 * (process_open_fds / process_max_fds) > 95 FOR 10m LABELS { - severity = "page" + severity = "critical" } ANNOTATIONS { summary = "too many open file descriptors", @@ -40,7 +43,7 @@ ALERT FdExhaustionClose IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 FOR 10m LABELS { - severity = "ticket" + severity = "warning" } 
ANNOTATIONS { summary = "file descriptors soon exhausted", @@ -52,7 +55,7 @@ ALERT FdExhaustionClose IF predict_linear(instance:fd_utilization[10m], 3600) > 1 FOR 10m LABELS { - severity = "page" + severity = "critical" } ANNOTATIONS { summary = "file descriptors soon exhausted", diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules index be6dc97f..c041881a 100644 --- a/assets/prometheus/rules/kube-apiserver.rules +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -2,7 +2,7 @@ ALERT K8SApiserverDown IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) FOR 5m LABELS { - severity = "page" + severity = "critical" } ANNOTATIONS { summary = "API server unreachable", @@ -20,7 +20,7 @@ ALERT K8SApiServerLatency ) / 1e6 > 1.0 FOR 10m LABELS { - severity = "ticket" + severity = "warning" } ANNOTATIONS { summary = "Kubernetes apiserver latency is high", diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules index 90546273..f75e2768 100644 --- a/assets/prometheus/rules/kube-controller-manager.rules +++ b/assets/prometheus/rules/kube-controller-manager.rules @@ -2,7 +2,7 @@ ALERT K8SControllerManagerDown IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) FOR 5m LABELS { - severity = "page", + severity = "critical", } ANNOTATIONS { summary = "Controller manager is down", diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules index 80e954dd..6eff4bcd 100644 --- a/assets/prometheus/rules/kube-scheduler.rules +++ b/assets/prometheus/rules/kube-scheduler.rules @@ -2,7 +2,7 @@ ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m LABELS { - severity = "page", + severity = "critical", } ANNOTATIONS { summary = 
"Scheduler is down", diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules index 124d8dd0..cbcd576c 100644 --- a/assets/prometheus/rules/kubelet.rules +++ b/assets/prometheus/rules/kubelet.rules @@ -2,7 +2,7 @@ ALERT K8SNodeNotReady IF kube_node_status_ready{condition="true"} == 0 FOR 1h LABELS { - severity = "ticket", + severity = "warning", } ANNOTATIONS { summary = "Node status is NotReady", @@ -20,7 +20,7 @@ ALERT K8SManyNodesNotReady ) > 0.2 FOR 1m LABELS { - severity = "page", + severity = "critical", } ANNOTATIONS { summary = "Many K8s nodes are Not Ready", @@ -31,7 +31,7 @@ ALERT K8SKubeletDown IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 FOR 1h LABELS { - severity = "ticket", + severity = "warning", } ANNOTATIONS { summary = "Many Kubelets cannot be scraped", @@ -42,7 +42,7 @@ ALERT K8SKubeletDown IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h LABELS { - severity = "page", + severity = "critical", } ANNOTATIONS { summary = "Many Kubelets cannot be scraped", @@ -52,7 +52,7 @@ ALERT K8SKubeletDown ALERT K8SKubeletTooManyPods IF kubelet_running_pod_count > 100 LABELS { - severity = "ticket", + severity = "warning", } ANNOTATIONS { summary = "Kubelet is close to pod limit", diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules index 9844947a..8fd5b7d0 100644 --- a/assets/prometheus/rules/node.rules +++ b/assets/prometheus/rules/node.rules @@ -2,7 +2,7 @@ ALERT NodeExporterDown IF up{job="node-exporter"} == 0 FOR 10m LABELS { - severity = "ticket" + severity = "warning" } ANNOTATIONS { summary = "node-exporter cannot be scraped", diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules index c29ed6ca..05c278f1 100644 --- a/assets/prometheus/rules/prometheus.rules +++ b/assets/prometheus/rules/prometheus.rules @@ -2,7 +2,7 @@ 
ALERT FailedReload IF prometheus_config_last_reload_successful == 0 FOR 10m LABELS { - severity = "ticket" + severity = "warning" } ANNOTATIONS { summary = "Prometheus configuration reload has failed", From 81fdfaddb3bf3ef8a77b83d19b33fc345cc1f483 Mon Sep 17 00:00:00 2001 From: Christian Jauvin Date: Fri, 16 Jun 2017 16:32:07 -0400 Subject: [PATCH 075/638] Fix relative links in kube-prometheus readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e4ed7a7a..6dbc5629 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ hack/cluster-monitoring/teardown ## Monitoring custom services -The example manifests in [/manifests/examples/example-app](/manifests/examples/example-app) +The example manifests in [/manifests/examples/example-app](/contrib/kube-prometheus/manifests/examples/example-app) deploy a fake service exposing Prometheus metrics. They additionally define a new Prometheus server and a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/master/Documentation/service-monitor.md), which specifies how the example service should be monitored. @@ -120,7 +120,7 @@ service as well. > Note that minikube hides some components like etcd so to see the extend of > this setup we recommend setting up a [local cluster using bootkube](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node). -An example for bootkube's multi-node vagrant setup is [here](/manifests/etcd/etcd-bootkube-vagrant-multi.yaml). +An example for bootkube's multi-node vagrant setup is [here](/contrib/kube-prometheus/manifests/etcd/etcd-bootkube-vagrant-multi.yaml). > Hint: this is merely an example for a local setup. The addresses will have to > be adapted for a setup, that is not a single etcd bootkube created cluster. 
From dea5226b9ff5166eacabd40bf8c969f9a2ff322c Mon Sep 17 00:00:00 2001 From: jordanjennings Date: Sat, 17 Jun 2017 13:03:50 -0400 Subject: [PATCH 076/638] kube-prometheus: update serviceMonitorSelector In cleanup commit 9cd05a8c39c6ed03688859de92372d93af5cc6e2 the labels on the ServiceMonitors were changed from k8s-apps to k8s-app. This commit makes the Prometheus selector match that change. --- manifests/prometheus/prometheus-k8s.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 63e9c3f7..df432c40 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -10,7 +10,7 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpression: - - {key: k8s-apps, operator: Exists} + - {key: k8s-app, operator: Exists} ruleSelector: matchLabels: role: prometheus-rulefiles From 4c42ab4fcc2d53f73cb2ea3b32822e0a92f3adfc Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 19 Jun 2017 11:32:23 +0200 Subject: [PATCH 077/638] kube-prometheus: fix correct selector --- .../prometheus/prometheus-k8s-service-monitor-alertmanager.yaml | 2 +- .../prometheus/prometheus-k8s-service-monitor-prometheus.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml index 29d68c82..e0e33f9d 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml @@ -3,7 +3,7 @@ kind: ServiceMonitor metadata: name: alertmanager labels: - app: alertmanager + k8s-app: alertmanager spec: selector: matchLabels: diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml 
b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml index be74cd6d..bfcb4e31 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml @@ -3,7 +3,7 @@ kind: ServiceMonitor metadata: name: prometheus labels: - app: prometheus + k8s-app: prometheus spec: selector: matchLabels: diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index df432c40..066997ee 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -9,7 +9,7 @@ spec: version: v1.7.0 serviceAccountName: prometheus-k8s serviceMonitorSelector: - matchExpression: + matchExpressions: - {key: k8s-app, operator: Exists} ruleSelector: matchLabels: From a5533a4f6c2277f44dff67b3eca93f93770b51a7 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 28 Jun 2017 10:50:17 +0200 Subject: [PATCH 078/638] kube-prometheus: ensure triggering alerts on down targets --- assets/prometheus/rules/general.rules | 4 +-- assets/prometheus/rules/kube-apiserver.rules | 2 +- .../rules/kube-controller-manager.rules | 3 +- assets/prometheus/rules/kube-scheduler.rules | 3 +- assets/prometheus/rules/kubelet.rules | 14 ++++----- assets/prometheus/rules/node.rules | 4 +-- .../prometheus/prometheus-k8s-rules.yaml | 30 ++++++++++--------- 7 files changed, 32 insertions(+), 28 deletions(-) diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules index 9e26ab9a..3500d689 100644 --- a/assets/prometheus/rules/general.rules +++ b/assets/prometheus/rules/general.rules @@ -1,14 +1,14 @@ ### Up Alerting ### Alert TargetDown - IF 100 * (count(up == 0) / count(up)) > 3 + IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { summary = "Targets are down", - description = "More than {{ $value }}% of targets are down." 
+ description = "{{ $value }}% or more of {{ $labels.job }} targets are down." } ### Dead man's switch ### diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules index c041881a..a7fdfddc 100644 --- a/assets/prometheus/rules/kube-apiserver.rules +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -1,5 +1,5 @@ ALERT K8SApiserverDown - IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + IF absent(up{job="apiserver"} == 1) FOR 5m LABELS { severity = "critical" diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules index f75e2768..3157cd12 100644 --- a/assets/prometheus/rules/kube-controller-manager.rules +++ b/assets/prometheus/rules/kube-controller-manager.rules @@ -1,5 +1,5 @@ ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + IF absent(up{job="kube-controller-manager"} == 1) FOR 5m LABELS { severity = "critical", @@ -7,4 +7,5 @@ ALERT K8SControllerManagerDown ANNOTATIONS { summary = "Controller manager is down", description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", + runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", } diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules index 6eff4bcd..ee86017a 100644 --- a/assets/prometheus/rules/kube-scheduler.rules +++ b/assets/prometheus/rules/kube-scheduler.rules @@ -1,5 +1,5 @@ ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + IF absent(up{job="kube-scheduler"} == 1) FOR 5m LABELS { severity = "critical", @@ -7,4 +7,5 @@ ALERT K8SSchedulerDown ANNOTATIONS { summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler", } diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules index cbcd576c..8c0843ce 100644 --- a/assets/prometheus/rules/kubelet.rules +++ b/assets/prometheus/rules/kubelet.rules @@ -11,24 +11,24 @@ ALERT K8SNodeNotReady ALERT K8SManyNodesNotReady IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + count(kube_node_status_ready{condition="true"} == 0) > 1 AND ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) + count(kube_node_status_ready{condition="true"} == 0) / - count by (cluster) (kube_node_status_ready{condition="true"}) + count(kube_node_status_ready{condition="true"}) ) > 0.2 FOR 1m LABELS { severity = "critical", } ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + summary = "Many Kubernetes nodes are Not Ready", + description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", } ALERT 
K8SKubeletDown - IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 + IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 FOR 1h LABELS { severity = "warning", @@ -39,7 +39,7 @@ ALERT K8SKubeletDown } ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 FOR 1h LABELS { severity = "critical", diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules index 8fd5b7d0..36ea482c 100644 --- a/assets/prometheus/rules/node.rules +++ b/assets/prometheus/rules/node.rules @@ -1,10 +1,10 @@ ALERT NodeExporterDown - IF up{job="node-exporter"} == 0 + IF absent(up{job="node-exporter"} == 1) FOR 10m LABELS { severity = "warning" } ANNOTATIONS { summary = "node-exporter cannot be scraped", - description = "Prometheus could not scrape a node-exporter for more than 10m.", + description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index f27cf33b..181a70c7 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -225,14 +225,14 @@ data: ### Up Alerting ### Alert TargetDown - IF 100 * (count(up == 0) / count(up)) > 3 + IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { summary = "Targets are down", - description = "More than {{ $value }}% of targets are down." + description = "{{ $value }}% or more of {{ $labels.job }} targets are down." 
} ### Dead man's switch ### @@ -287,7 +287,7 @@ data: } kube-apiserver.rules: |+ ALERT K8SApiserverDown - IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + IF absent(up{job="apiserver"} == 1) FOR 5m LABELS { severity = "critical" @@ -316,7 +316,7 @@ data: } kube-controller-manager.rules: |+ ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + IF absent(up{job="kube-controller-manager"} == 1) FOR 5m LABELS { severity = "critical", @@ -324,6 +324,7 @@ data: ANNOTATIONS { summary = "Controller manager is down", description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", } kubelet.rules: |+ ALERT K8SNodeNotReady @@ -339,24 +340,24 @@ data: ALERT K8SManyNodesNotReady IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + count(kube_node_status_ready{condition="true"} == 0) > 1 AND ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) + count(kube_node_status_ready{condition="true"} == 0) / - count by (cluster) (kube_node_status_ready{condition="true"}) + count(kube_node_status_ready{condition="true"}) ) > 0.2 FOR 1m LABELS { severity = "critical", } ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + summary = "Many Kubernetes nodes are Not Ready", + description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", } ALERT K8SKubeletDown - IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 + IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 FOR 1h LABELS { severity = 
"warning", @@ -367,7 +368,7 @@ data: } ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 FOR 1h LABELS { severity = "critical", @@ -560,7 +561,7 @@ data: histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 kube-scheduler.rules: |+ ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + IF absent(up{job="kube-scheduler"} == 1) FOR 5m LABELS { severity = "critical", @@ -568,17 +569,18 @@ data: ANNOTATIONS { summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler", } node.rules: |+ ALERT NodeExporterDown - IF up{job="node-exporter"} == 0 + IF absent(up{job="node-exporter"} == 1) FOR 10m LABELS { severity = "warning" } ANNOTATIONS { summary = "node-exporter cannot be scraped", - description = "Prometheus could not scrape a node-exporter for more than 10m.", + description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } prometheus.rules: |+ ALERT FailedReload From 5a5bfab855b6954b536be73f426db311cf902d1c Mon Sep 17 00:00:00 2001 From: jordanjennings Date: Thu, 29 Jun 2017 12:44:53 -0400 Subject: [PATCH 079/638] kube-prometheus: Update RBAC for kube-state-metrics kube-state-metrics needs additional permissions for replicationcontrollers, limitranges, and services (otherwise the API server logs are flooded with RBAC DENY messages) --- .../kube-state-metrics/kube-state-metrics-cluster-role.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml 
b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml index e4b30cf5..833afdec 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml @@ -7,7 +7,10 @@ rules: resources: - nodes - pods + - services - resourcequotas + - replicationcontrollers + - limitranges verbs: ["list", "watch"] - apiGroups: ["extensions"] resources: From 1db18ae87c8e1e140a6e4e9c0434d4f13932161e Mon Sep 17 00:00:00 2001 From: Jean-Paul Calderone Date: Fri, 30 Jun 2017 12:31:39 -0400 Subject: [PATCH 080/638] bump patch level in grafana-watcher version --- manifests/grafana/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index b2ae3f8f..a45f893e 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -41,7 +41,7 @@ spec: memory: 200Mi cpu: 200m - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.4 + image: quay.io/coreos/grafana-watcher:v0.0.5 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://localhost:3000' From c0a3447c01825dbb75a928b0d23c03a3a70ebcd1 Mon Sep 17 00:00:00 2001 From: Gabi Davar Date: Sat, 8 Jul 2017 14:29:23 +0300 Subject: [PATCH 081/638] remove duplicate `volumeMount` entry. 
--- manifests/grafana/grafana-deployment.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index a45f893e..6f3e72d4 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -56,9 +56,6 @@ spec: secretKeyRef: name: grafana-credentials key: password - volumeMounts: - - name: grafana-dashboards - mountPath: /var/grafana-dashboards resources: requests: memory: "16Mi" From eae253db74ce10117dbef9a8ee3c6cc513ba043a Mon Sep 17 00:00:00 2001 From: Andy Lindeman Date: Wed, 12 Jul 2017 12:57:32 -0400 Subject: [PATCH 082/638] Scrapes cAdvisor port for metrics in Kubernetes 1.7 --- .../prometheus/prometheus-k8s-service-monitor-kubelet.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index cdc3ffb6..0eac9630 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -9,6 +9,8 @@ spec: endpoints: - port: http-metrics interval: 30s + - port: cadvisor + interval: 30s honorLabels: true selector: matchLabels: From c97a329792e65665a2ad1ee2fab11b14bd25d3d9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 29 Jun 2017 16:32:51 +0200 Subject: [PATCH 083/638] kube-prometheus: run prometheus-k8s with only those roles it needs --- hack/cluster-monitoring/deploy | 4 +- hack/cluster-monitoring/teardown | 4 +- .../prometheus-cluster-role-binding.yaml | 12 ----- .../prometheus/prometheus-cluster-role.yaml | 18 ------- .../prometheus-k8s-role-bindings.yaml | 54 +++++++++++++++++++ .../prometheus/prometheus-k8s-roles.yaml | 50 +++++++++++++++++ 6 files changed, 110 insertions(+), 32 deletions(-) delete mode 100644 manifests/prometheus/prometheus-cluster-role-binding.yaml delete mode 100644 
manifests/prometheus/prometheus-cluster-role.yaml create mode 100644 manifests/prometheus/prometheus-k8s-role-bindings.yaml create mode 100644 manifests/prometheus/prometheus-k8s-roles.yaml diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index c565d442..9176b956 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -27,6 +27,8 @@ kctl apply -f manifests/node-exporter kctl apply -f manifests/kube-state-metrics kctl apply -f manifests/grafana/grafana-credentials.yaml kctl apply -f manifests/grafana -kctl apply -f manifests/prometheus/ +find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \; +kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml +kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml kctl apply -f manifests/alertmanager/ diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown index 9fcc4513..ac4d222d 100755 --- a/hack/cluster-monitoring/teardown +++ b/hack/cluster-monitoring/teardown @@ -15,7 +15,9 @@ kctl() { kctl delete -f manifests/node-exporter kctl delete -f manifests/kube-state-metrics kctl delete -f manifests/grafana -kctl delete -f manifests/prometheus +find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \; +kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml +kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml kctl delete -f manifests/alertmanager # Hack: wait a bit to let the controller delete the deployed Prometheus server. 
diff --git a/manifests/prometheus/prometheus-cluster-role-binding.yaml b/manifests/prometheus/prometheus-cluster-role-binding.yaml deleted file mode 100644 index 3600490f..00000000 --- a/manifests/prometheus/prometheus-cluster-role-binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRoleBinding -metadata: - name: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring diff --git a/manifests/prometheus/prometheus-cluster-role.yaml b/manifests/prometheus/prometheus-cluster-role.yaml deleted file mode 100644 index a85422ec..00000000 --- a/manifests/prometheus/prometheus-cluster-role.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRole -metadata: - name: prometheus -rules: -- apiGroups: [""] - resources: - - nodes - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] diff --git a/manifests/prometheus/prometheus-k8s-role-bindings.yaml b/manifests/prometheus/prometheus-k8s-role-bindings.yaml new file mode 100644 index 00000000..5f190e7a --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-role-bindings.yaml @@ -0,0 +1,54 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: 
rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests/prometheus/prometheus-k8s-roles.yaml new file mode 100644 index 00000000..7a3efa90 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-roles.yaml @@ -0,0 +1,50 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: monitoring +rules: +- apiGroups: [""] + resources: + - nodes + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: kube-system +rules: +- apiGroups: [""] + resources: + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: default +rules: +- apiGroups: [""] + resources: + - services + - endpoints + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus-k8s +rules: +- nonResourceURLs: ["/metrics"] + verbs: ["get"] From 6571c71e8ca00cced4373ae664b8da99f0437fcb Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 20 Jul 2017 17:49:54 +0200 Subject: [PATCH 084/638] *: bump version to v0.11.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index d574b89f..64635192 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -14,7 +14,7 @@ spec: serviceAccountName: prometheus-operator containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.10.1 + image: quay.io/coreos/prometheus-operator:v0.11.0 args: - "--kubelet-service=kube-system/kubelet" - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" From 817bd1da972610caa0d822100cd4782d9b19db16 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 24 Jul 2017 15:25:31 +0200 Subject: [PATCH 085/638] kube-prometheus/docs: add docs on how to modify assets --- Makefile | 3 +++ docs/developing-alerts-and-dashboards.md | 27 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 Makefile create mode 100644 docs/developing-alerts-and-dashboards.md diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..04bd205a --- /dev/null +++ b/Makefile @@ -0,0 +1,3 @@ +generate: + @echo ">> Compiling assets and generating Kubernetes manifests" + @hack/scripts/generate-manifests.sh diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md new file mode 100644 index 00000000..80630940 --- /dev/null +++ b/docs/developing-alerts-and-dashboards.md @@ -0,0 +1,27 @@ +# Developing Alerts and Dashboards + +`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point one might like to extend them. This document is intended to explain the workflow of how additional alerting rules and dashboards could be added. + +For both, the Prometheus alerting rules as well as the Grafana dashboards, there are Kubernetes `ConfigMap`s, that are generated from content in the `assets/` directory. 
+ +The source of truth for the alerts and dashboards are the files in the `assets/` directory. The respective files have to be changed there and then the `make generate` make target is executed to re-generate the Kubernetes manifests. + +## Alerts + +The `ConfigMap` that is generated and holds the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`. + +It is generated by taking all the `*.rules` files in the `assets/prometheus/rules/` directory and generate the `ConfigMap`. + +To extend the alerting rules simply add a new `.rules` file into the `assets/prometheus/rules/` directory and re-generate the manifests. To modify the existing rules, simply edit the respective `.rules` file and re-generate the manifest. + +Then the generated manifest can be applied against a Kubernetes cluster. + +## Dashboards + +The `ConfigMap` that is generated and holds the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`. + +As Grafana's support for applying dashboards from files is limited a sidecar (called "grafana-watcher") was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database is in sync with the dashboard definitions. + +To edit/create a dashboard login to Grafana and modify and save the dashboard. Then download the dashboard definition in Grafana through `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests. + +Then the generated manifest can be applied against a Kubernetes cluster. 
From f63473a2697e9b730cc32832527c4068bb893c05 Mon Sep 17 00:00:00 2001 From: Wei Wei Date: Mon, 10 Jul 2017 13:07:27 +0800 Subject: [PATCH 086/638] add grafana chart for kube-prometheus --- manifests/grafana/grafana-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index a45f893e..05ca87eb 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: grafana - image: grafana/grafana:4.1.1 + image: grafana/grafana:4.4.1 env: - name: GF_AUTH_BASIC_ENABLED value: "true" @@ -41,7 +41,7 @@ spec: memory: 200Mi cpu: 200m - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.5 + image: quay.io/coreos/grafana-watcher:v0.0.6 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://localhost:3000' From 7010e32130932372c87562881bb0d4908cc12ad0 Mon Sep 17 00:00:00 2001 From: Zachary Yonash Date: Thu, 27 Jul 2017 03:49:25 -0400 Subject: [PATCH 087/638] Added a few extra node rules (#478) --- assets/prometheus/rules/node.rules | 32 +++++++++++++++++++ .../prometheus/prometheus-k8s-rules.yaml | 32 +++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules index 36ea482c..54085392 100644 --- a/assets/prometheus/rules/node.rules +++ b/assets/prometheus/rules/node.rules @@ -8,3 +8,35 @@ ALERT NodeExporterDown summary = "node-exporter cannot be scraped", description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } +ALERT K8SNodeOutOfDisk + IF kube_node_status_out_of_disk{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Node ran out of disk space.", + description = "{{ $labels.node }} has run out of disk space.", + } + +ALERT 
K8SNodeMemoryPressure + IF kube_node_status_memory_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under memory pressure.", + description = "{{ $labels.node }} is under memory pressure.", + } + +ALERT K8SNodeDiskPressure + IF kube_node_status_disk_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under disk pressure.", + description = "{{ $labels.node }} is under disk pressure.", + } diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 181a70c7..e1740562 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -582,6 +582,38 @@ data: summary = "node-exporter cannot be scraped", description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } + ALERT K8SNodeOutOfDisk + IF kube_node_status_out_of_disk{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Node ran out of disk space.", + description = "{{ $labels.node }} has run out of disk space.", + } + + ALERT K8SNodeMemoryPressure + IF kube_node_status_memory_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under memory pressure.", + description = "{{ $labels.node }} is under memory pressure.", + } + + ALERT K8SNodeDiskPressure + IF kube_node_status_disk_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under disk pressure.", + description = "{{ $labels.node }} is under disk pressure.", + } prometheus.rules: |+ ALERT FailedReload IF prometheus_config_last_reload_successful == 0 From 154456ad799b1b12c97d6956cc4e2d1dfcb25d74 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk 
Date: Thu, 27 Jul 2017 14:17:57 +0200 Subject: [PATCH 088/638] generate Prometheus Operator deployments with jsonnet (#508) * *: use jsonnet to generate manifests * generate Prometheus Operator manifests with jsonnet * add jsonnet Dockerfile for generating with jenkins --- .../prometheus-operator.yaml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 64635192..b2c37fd1 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -1,9 +1,9 @@ apiVersion: extensions/v1beta1 kind: Deployment metadata: - name: prometheus-operator labels: k8s-app: prometheus-operator + name: prometheus-operator spec: replicas: 1 template: @@ -11,20 +11,20 @@ spec: labels: k8s-app: prometheus-operator spec: - serviceAccountName: prometheus-operator containers: - - name: prometheus-operator + - args: + - --kubelet-service=kube-system/kubelet + - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 image: quay.io/coreos/prometheus-operator:v0.11.0 - args: - - "--kubelet-service=kube-system/kubelet" - - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" + name: prometheus-operator ports: - - name: http - containerPort: 8080 + - containerPort: 8080 + name: http resources: + limits: + cpu: 200m + memory: 100Mi requests: cpu: 100m memory: 50Mi - limits: - cpu: 200m - memory: 300Mi + serviceAccountName: prometheus-operator From caeaaf52ea613f40d8c2e8913cca9fd9e24a96ff Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 28 Jul 2017 12:06:00 +0200 Subject: [PATCH 089/638] *: bump version --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 
b2c37fd1..29bbf746 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.11.0 + image: quay.io/coreos/prometheus-operator:v0.11.1 name: prometheus-operator ports: - containerPort: 8080 From d34811cbe9c2656d83483bc3399d2812a7e22234 Mon Sep 17 00:00:00 2001 From: xvzup Date: Tue, 1 Aug 2017 18:10:23 +0200 Subject: [PATCH 090/638] Update prometheus-k8s-roles.yaml Resource pods added to role prometheus-k8s for namespace default. This is required to monitor kube-apiserver. --- manifests/prometheus/prometheus-k8s-roles.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests/prometheus/prometheus-k8s-roles.yaml index 7a3efa90..14302ea0 100644 --- a/manifests/prometheus/prometheus-k8s-roles.yaml +++ b/manifests/prometheus/prometheus-k8s-roles.yaml @@ -39,6 +39,7 @@ rules: resources: - services - endpoints + - pods verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 From f201c79f5dd8a244d6779c3168d0ec78912e40c3 Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 2 Aug 2017 11:37:22 +0200 Subject: [PATCH 091/638] hints and notes added to handle grafana dashboards Added some comments to the doc, to: - Point the directory where make generate should be executed from - Mention that the dashboard file should be suffixed with "-dashboard.json". 
- Added hints to apply the new configMap and restart grafana --- docs/developing-alerts-and-dashboards.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md index 80630940..908e1132 100644 --- a/docs/developing-alerts-and-dashboards.md +++ b/docs/developing-alerts-and-dashboards.md @@ -6,6 +6,8 @@ For both, the Prometheus alerting rules as well as the Grafana dashboards, there The source of truth for the alerts and dashboards are the files in the `assets/` directory. The respective files have to be changed there and then the `make generate` make target is executed to re-generate the Kubernetes manifests. +Note: `make generate` should be executed from kube-prometheus base directory. + ## Alerts The `ConfigMap` that is generated and holds the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`. @@ -22,6 +24,12 @@ The `ConfigMap` that is generated and holds the dashboard definitions can be fou As Grafana's support for applying dashboards from files is limited a sidecar (called "grafana-watcher") was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database is in sync with the dashboard definitions. -To edit/create a dashboard login to Grafana and modify and save the dashboard. Then download the dashboard definition in Grafana through `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests. +To edit/create a dashboard login to Grafana and modify and save the dashboard. Then download the dashboard definition in Grafana through `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests (executing `make generate` from kube-prometheus base directory). -Then the generated manifest can be applied against a Kubernetes cluster. 
+Note: The dashboard json file to be copied in `assets/grafana/` should be suffixed with `-dashboard.json`, otherwise it won't be processed by `make generate`. + +Then the generated manifest can be applied against a Kubernetes cluster with something like: +`kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml` + +And to apply the changes, restart grafana: +`kubectl -n monitoring delete pod -l app=grafana` From e1db00daedefb42ba65d69af8d090bed22b41c28 Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 2 Aug 2017 11:38:26 +0200 Subject: [PATCH 092/638] Update developing-alerts-and-dashboards.md --- docs/developing-alerts-and-dashboards.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md index 908e1132..cae67926 100644 --- a/docs/developing-alerts-and-dashboards.md +++ b/docs/developing-alerts-and-dashboards.md @@ -29,7 +29,11 @@ To edit/create a dashboard login to Grafana and modify and save the dashboard. T Note: The dashboard json file to be copied in `assets/grafana/` should be suffixed with `-dashboard.json`, otherwise it won't be processed by `make generate`. 
Then the generated manifest can be applied against a Kubernetes cluster with something like: -`kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml` +``` +kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml +``` And to apply the changes, restart grafana: -`kubectl -n monitoring delete pod -l app=grafana` +``` +kubectl -n monitoring delete pod -l app=grafana +``` From 4e4b00bb5cdb7ce11d77f6a23b49b3aaf5036157 Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 2 Aug 2017 11:42:24 +0200 Subject: [PATCH 093/638] link in dashboard section Including a link in dashboard section to the document with all the details (docs/developing-alerts-and-dashboards.md) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6dbc5629..82cb7a80 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,8 @@ sidecar container aims to emulate the behavior, by keeping the Grafana database with the provided ConfigMap. Hence, the Grafana pod is effectively stateless. This allows managing dashboards via `git` etc. and easily deploying them via CD pipelines. +For information about how to update/handle the dashboards check [Developing alerts and dashboards](docs/developing-alerts-and-dashboards.md) doc. + In the future, a separate Grafana operator will support gathering dashboards from multiple ConfigMaps based on label selection. From bcb2db7ce5986f5121b525e90d7dbf0fbf93793a Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 2 Aug 2017 12:08:03 +0200 Subject: [PATCH 094/638] Removed grafana restart comment Removed grafana restart comment from doc, as it's not necessary (grafana-watcher should pick up the changes and reload the dashboards). 
--- docs/developing-alerts-and-dashboards.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md index cae67926..bfba9f0d 100644 --- a/docs/developing-alerts-and-dashboards.md +++ b/docs/developing-alerts-and-dashboards.md @@ -32,8 +32,4 @@ Then the generated manifest can be applied against a Kubernetes cluster with som ``` kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml ``` - -And to apply the changes, restart grafana: -``` -kubectl -n monitoring delete pod -l app=grafana -``` +That will update the ConfigMap `grafana-dashboards`. Change should be automatically detected by grafana-watcher and dashboards reloaded. From 66dd7b253dfdbdf44e411e82a8fae29d632e02e1 Mon Sep 17 00:00:00 2001 From: eedugon Date: Wed, 2 Aug 2017 12:09:44 +0200 Subject: [PATCH 095/638] Hints and notes added to handle grafana dashboards (#534) --- README.md | 2 ++ docs/developing-alerts-and-dashboards.md | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6dbc5629..82cb7a80 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,8 @@ sidecar container aims to emulate the behavior, by keeping the Grafana database with the provided ConfigMap. Hence, the Grafana pod is effectively stateless. This allows managing dashboards via `git` etc. and easily deploying them via CD pipelines. +For information about how to update/handle the dashboards check [Developing alerts and dashboards](docs/developing-alerts-and-dashboards.md) doc. + In the future, a separate Grafana operator will support gathering dashboards from multiple ConfigMaps based on label selection. 
diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md index 80630940..bfba9f0d 100644 --- a/docs/developing-alerts-and-dashboards.md +++ b/docs/developing-alerts-and-dashboards.md @@ -6,6 +6,8 @@ For both, the Prometheus alerting rules as well as the Grafana dashboards, there The source of truth for the alerts and dashboards are the files in the `assets/` directory. The respective files have to be changed there and then the `make generate` make target is executed to re-generate the Kubernetes manifests. +Note: `make generate` should be executed from kube-prometheus base directory. + ## Alerts The `ConfigMap` that is generated and holds the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`. @@ -22,6 +24,12 @@ The `ConfigMap` that is generated and holds the dashboard definitions can be fou As Grafana's support for applying dashboards from files is limited a sidecar (called "grafana-watcher") was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database is in sync with the dashboard definitions. -To edit/create a dashboard login to Grafana and modify and save the dashboard. Then download the dashboard definition in Grafana through `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests. +To edit/create a dashboard login to Grafana and modify and save the dashboard. Then download the dashboard definition in Grafana through `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests (executing `make generate` from kube-prometheus base directory). -Then the generated manifest can be applied against a Kubernetes cluster. +Note: The dashboard json file to be copied in `assets/grafana/` should be suffixed with `-dashboard.json`, otherwise it won't be processed by `make generate`. 
+ +Then the generated manifest can be applied against a Kubernetes cluster with something like: +``` +kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml +``` +That will update the ConfigMap `grafana-dashboards`. Change should be automatically detected by grafana-watcher and dashboards reloaded. From afc95c87dde71a8fc4a0d2772cc8e8c524bfa06f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reinhard=20N=C3=A4gele?= Date: Fri, 11 Aug 2017 14:35:38 +0200 Subject: [PATCH 096/638] Use grafana watcher v0.0.7 throughout the repo --- manifests/grafana/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 569fdfdb..6d399e6a 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -41,7 +41,7 @@ spec: memory: 200Mi cpu: 200m - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.6 + image: quay.io/coreos/grafana-watcher:v0.0.7 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://localhost:3000' From b75d263567d524aead965e52ab89c567914b614a Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Tue, 15 Aug 2017 17:18:09 +0200 Subject: [PATCH 097/638] grafana configmap generator integrated --- hack/grafana-dashboards-configmap-generator | 1 + hack/scripts/generate-manifests.sh | 9 ++++++++- manifests/grafana/grafana-dashboards.yaml | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) create mode 160000 hack/grafana-dashboards-configmap-generator diff --git a/hack/grafana-dashboards-configmap-generator b/hack/grafana-dashboards-configmap-generator new file mode 160000 index 00000000..f901955e --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator @@ -0,0 +1 @@ +Subproject commit f901955e8c95b8dd7f3c038caabc0a8d41eec125 diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index 280bc121..64287dae 100755 --- 
a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -4,7 +4,14 @@ hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml # Generate Dashboard ConfigMap -hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml +#hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml + +# Generate Dashboard ConfigMap with configmap-generator tool +# Max Size per ConfigMap: 240000 +# Input dir: assets/grafana +# output file: manifests/grafana/grafana-dashboards.yaml +test -f manifests/grafana/grafana-dashboards.yaml && rm -f manifests/grafana/grafana-dashboards.yaml +hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana -o manifests/grafana/grafana-dashboards.yaml # Generate Grafana Credentials Secret hack/scripts/generate-grafana-credentials-secret.sh admin admin > manifests/grafana/grafana-credentials.yaml diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 92a88fbf..a1179552 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -3425,3 +3425,4 @@ data: "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090" } +--- From e48278f3973bb88fe47c41ac46dbc02264025b78 Mon Sep 17 00:00:00 2001 From: crandl201 Date: Thu, 17 Aug 2017 20:05:55 -0400 Subject: [PATCH 098/638] update kube-state rules for 1.0.0 --- assets/prometheus/rules/kubelet.rules | 8 ++++---- assets/prometheus/rules/node.rules | 6 +++--- manifests/prometheus/prometheus-k8s-rules.yaml | 14 +++++++------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules index 8c0843ce..0d47d9d7 100644 --- a/assets/prometheus/rules/kubelet.rules +++ b/assets/prometheus/rules/kubelet.rules @@ -1,5 +1,5 @@ ALERT K8SNodeNotReady - IF 
kube_node_status_ready{condition="true"} == 0 + IF kube_node_status_condition{condition="Ready", status="true"} == 0 FOR 1h LABELS { severity = "warning", @@ -11,12 +11,12 @@ ALERT K8SNodeNotReady ALERT K8SManyNodesNotReady IF - count(kube_node_status_ready{condition="true"} == 0) > 1 + count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1 AND ( - count(kube_node_status_ready{condition="true"} == 0) + count(kube_node_status_condition{condition="Ready", status="true"} == 0) / - count(kube_node_status_ready{condition="true"}) + count(kube_node_status_condition{condition="Ready", status="true"}) ) > 0.2 FOR 1m LABELS { diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules index 54085392..94af0990 100644 --- a/assets/prometheus/rules/node.rules +++ b/assets/prometheus/rules/node.rules @@ -9,7 +9,7 @@ ALERT NodeExporterDown description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } ALERT K8SNodeOutOfDisk - IF kube_node_status_out_of_disk{condition="true"} == 1 + IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 LABELS { service = "k8s", severity = "critical" @@ -20,7 +20,7 @@ ALERT K8SNodeOutOfDisk } ALERT K8SNodeMemoryPressure - IF kube_node_status_memory_pressure{condition="true"} == 1 + IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 LABELS { service = "k8s", severity = "warning" @@ -31,7 +31,7 @@ ALERT K8SNodeMemoryPressure } ALERT K8SNodeDiskPressure - IF kube_node_status_disk_pressure{condition="true"} == 1 + IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 LABELS { service = "k8s", severity = "warning" diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index e1740562..a4459b43 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -328,7 
+328,7 @@ data: } kubelet.rules: |+ ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 + IF kube_node_status_condition{condition="Ready", status="true"} == 0 FOR 1h LABELS { severity = "warning", @@ -340,12 +340,12 @@ data: ALERT K8SManyNodesNotReady IF - count(kube_node_status_ready{condition="true"} == 0) > 1 + count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1 AND ( - count(kube_node_status_ready{condition="true"} == 0) + count(kube_node_status_condition{condition="Ready", status="true"} == 0) / - count(kube_node_status_ready{condition="true"}) + count(kube_node_status_condition{condition="Ready", status="true"}) ) > 0.2 FOR 1m LABELS { @@ -583,7 +583,7 @@ data: description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } ALERT K8SNodeOutOfDisk - IF kube_node_status_out_of_disk{condition="true"} == 1 + IF kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 LABELS { service = "k8s", severity = "critical" @@ -594,7 +594,7 @@ data: } ALERT K8SNodeMemoryPressure - IF kube_node_status_memory_pressure{condition="true"} == 1 + IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 LABELS { service = "k8s", severity = "warning" @@ -605,7 +605,7 @@ data: } ALERT K8SNodeDiskPressure - IF kube_node_status_disk_pressure{condition="true"} == 1 + IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 LABELS { service = "k8s", severity = "warning" From abc7bcd11a3be4907d81901ab3b71807dd7322f0 Mon Sep 17 00:00:00 2001 From: Sam Manzer Date: Tue, 22 Aug 2017 10:09:25 -0500 Subject: [PATCH 099/638] bump grafana-watcher version to v0.0.8 --- manifests/grafana/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 6d399e6a..65dc78e4 100644 --- 
a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -41,7 +41,7 @@ spec: memory: 200Mi cpu: 200m - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.7 + image: quay.io/coreos/grafana-watcher:v0.0.8 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://localhost:3000' From f87fd9cc0a7f6c0f87a1443d743e9a064a803642 Mon Sep 17 00:00:00 2001 From: Ryan Walls Date: Tue, 22 Aug 2017 17:09:37 -0600 Subject: [PATCH 100/638] Update kube-prometheus to use kube-state-metrics v1.0.0 --- .../kube-state-metrics-cluster-role.yaml | 10 ++++++++++ .../kube-state-metrics-deployment.yaml | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml index 833afdec..6ae8db88 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml @@ -11,6 +11,7 @@ rules: - resourcequotas - replicationcontrollers - limitranges + - persistentvolumeclaims verbs: ["list", "watch"] - apiGroups: ["extensions"] resources: @@ -18,3 +19,12 @@ rules: - deployments - replicasets verbs: ["list", "watch"] +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] +- apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] \ No newline at end of file diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index 7d98d43e..3b24611b 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -12,7 +12,7 @@ spec: serviceAccountName: kube-state-metrics containers: - name: kube-state-metrics - image: quay.io/coreos/kube-state-metrics:v0.5.0 + image: quay.io/coreos/kube-state-metrics:v1.0.0 ports: 
- name: metrics containerPort: 8080 From 40a45fcf7dfec3e925523d322ec4131cba775f24 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 23 Aug 2017 14:36:00 +0200 Subject: [PATCH 101/638] *: adapt all manifests from v1alpha1 to v1 --- manifests/alertmanager/alertmanager.yaml | 8 ++++---- manifests/examples/basic-auth/service-monitor.yaml | 2 +- manifests/examples/example-app/prometheus-frontend.yaml | 2 +- .../examples/example-app/servicemonitor-frontend.yaml | 2 +- .../prometheus-k8s-service-monitor-alertmanager.yaml | 2 +- .../prometheus-k8s-service-monitor-apiserver.yaml | 2 +- ...theus-k8s-service-monitor-kube-controller-manager.yaml | 2 +- .../prometheus-k8s-service-monitor-kube-scheduler.yaml | 2 +- ...prometheus-k8s-service-monitor-kube-state-metrics.yaml | 2 +- .../prometheus-k8s-service-monitor-kubelet.yaml | 2 +- .../prometheus-k8s-service-monitor-node-exporter.yaml | 2 +- ...rometheus-k8s-service-monitor-prometheus-operator.yaml | 2 +- .../prometheus-k8s-service-monitor-prometheus.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index 91f46bb4..bba9ff2e 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -1,9 +1,9 @@ -apiVersion: "monitoring.coreos.com/v1alpha1" -kind: "Alertmanager" +apiVersion: monitoring.coreos.com/v1 +kind: Alertmanager metadata: - name: "main" + name: main labels: - alertmanager: "main" + alertmanager: main spec: replicas: 3 version: v0.7.1 diff --git a/manifests/examples/basic-auth/service-monitor.yaml b/manifests/examples/basic-auth/service-monitor.yaml index 52428b1d..e62b9fa9 100644 --- a/manifests/examples/basic-auth/service-monitor.yaml +++ b/manifests/examples/basic-auth/service-monitor.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor 
metadata: labels: diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index c092d8e2..f0341d51 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: frontend diff --git a/manifests/examples/example-app/servicemonitor-frontend.yaml b/manifests/examples/example-app/servicemonitor-frontend.yaml index 4ceaacd6..cc3d42fa 100644 --- a/manifests/examples/example-app/servicemonitor-frontend.yaml +++ b/manifests/examples/example-app/servicemonitor-frontend.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: frontend diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml index e0e33f9d..19669e3e 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: alertmanager diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml index 09a87c2e..40361f04 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kube-apiserver diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml 
b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml index eef95a84..681c320d 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kube-controller-manager diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml index 663f8cfb..6927f58e 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kube-scheduler diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml index a276702a..6563a4d4 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kube-state-metrics diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index 0eac9630..60ddc0c4 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kubelet diff --git 
a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml index b68ed89f..e1b083bb 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: node-exporter diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml index 23c04073..0b8028e7 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: prometheus-operator diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml index bfcb4e31..c3d11e57 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: prometheus diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 066997ee..421c024d 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -1,4 +1,4 @@ -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: k8s From d5285e02881f44610625976f5cd2d4ec98c650c1 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 24 Aug 2017 11:14:50 +0200 Subject: [PATCH 102/638] *: 
bump version to v0.12.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 29bbf746..5f6d02ea 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.11.1 + image: quay.io/coreos/prometheus-operator:v0.12.0 name: prometheus-operator ports: - containerPort: 8080 From 57efea743f9e458d6212dc8a80b708fa9d56e3b8 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 24 Aug 2017 14:01:26 +0200 Subject: [PATCH 103/638] kube-prometheus: update the Prometheus Operator RBAC roles --- .../prometheus-operator-cluster-role.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml index 2f248651..0a78305b 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -8,7 +8,13 @@ rules: resources: - thirdpartyresources verbs: - - create + - "*" +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - "*" - apiGroups: - monitoring.coreos.com resources: @@ -40,3 +46,7 @@ rules: resources: - nodes verbs: ["list", "watch"] +- apiGroups: [""] + resources: + - namespaces + verbs: ["list"] From 7708f8eb83aa6ab31449b0b2e57133570e60c45f Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 28 Aug 2017 09:37:24 +0200 Subject: [PATCH 104/638] kube-prometheus: adapt deploy script to wait for CRDs --- hack/cluster-monitoring/deploy | 11 +++++++---- 
1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 9176b956..d959029d 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -17,10 +17,13 @@ kctl() { kctl apply -f manifests/prometheus-operator # Wait for TPRs to be ready. -printf "Waiting for Operator to register third party objects..." -until kctl get servicemonitor > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get prometheus > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get alertmanager > /dev/null 2>&1; do sleep 1; printf "."; done +printf "Waiting for Operator to register custom resource definitions..." +until kctl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done echo "done!" 
kctl apply -f manifests/node-exporter From 146c024950d79fd4588c5e3581d7fb6f7f34eab3 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 28 Aug 2017 15:53:09 +0200 Subject: [PATCH 105/638] kube-prometheus: bump kube-state-metrics to v1.0.1 release --- .../kube-state-metrics-deployment.yaml | 36 +++++++++++++++++-- .../kube-state-metrics-role-binding.yaml | 12 +++++++ .../kube-state-metrics-role.yaml | 15 ++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml create mode 100644 manifests/kube-state-metrics/kube-state-metrics-role.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index 3b24611b..bd313f21 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -12,10 +12,16 @@ spec: serviceAccountName: kube-state-metrics containers: - name: kube-state-metrics - image: quay.io/coreos/kube-state-metrics:v1.0.0 + image: quay.io/coreos/kube-state-metrics:v1.0.1 ports: - name: metrics containerPort: 8080 + readinessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 resources: requests: memory: 100Mi @@ -23,4 +29,30 @@ spec: limits: memory: 200Mi cpu: 200m - + - name: addon-resizer + image: gcr.io/google_containers/addon-resizer:1.0 + resources: + limits: + cpu: 100m + memory: 30Mi + requests: + cpu: 100m + memory: 30Mi + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + command: + - /pod_nanny + - --container=kube-state-metrics + - --cpu=100m + - --extra-cpu=1m + - --memory=100Mi + - --extra-memory=2Mi + - --threshold=5 + - --deployment=kube-state-metrics diff --git a/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml 
b/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml new file mode 100644 index 00000000..a93c3965 --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kube-state-metrics-resizer +subjects: +- kind: ServiceAccount + name: kube-state-metrics + diff --git a/manifests/kube-state-metrics/kube-state-metrics-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-role.yaml new file mode 100644 index 00000000..6bf21fb8 --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-role.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: kube-state-metrics-resizer +rules: +- apiGroups: [""] + resources: + - pods + verbs: ["get"] +- apiGroups: ["extensions"] + resources: + - deployments + resourceNames: ["kube-state-metrics"] + verbs: ["get", "update"] + From bbfabfaf4dcddd0fd0f0aa7b96ba852d58656704 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 22 Aug 2017 17:15:56 +0200 Subject: [PATCH 106/638] Don't name Grafana port --- manifests/grafana/grafana-deployment.yaml | 2 +- manifests/grafana/grafana-service.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 65dc78e4..1666deae 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -1,4 +1,4 @@ -apiVersion: extensions/v1beta1 +apiVersion: apps/v1beta1 kind: Deployment metadata: name: grafana diff --git a/manifests/grafana/grafana-service.yaml b/manifests/grafana/grafana-service.yaml index adb26233..fbcee40d 100644 --- a/manifests/grafana/grafana-service.yaml +++ b/manifests/grafana/grafana-service.yaml @@ -7,9 +7,9 @@ metadata: spec: type: NodePort ports: - - name: 
web - port: 3000 + - port: 3000 protocol: TCP nodePort: 30902 + targetPort: web selector: app: grafana From 781d00bc5d0e2b8c738913334b9484c96598c2b3 Mon Sep 17 00:00:00 2001 From: Vincent Brouillet Date: Wed, 6 Sep 2017 11:58:14 +1000 Subject: [PATCH 107/638] Add doc to monitor a secure etcd cluster, typically set up via kube-aws --- ...monitor-an-external-secure-etcd-cluster.md | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 docs/How-to-monitor-an-external-secure-etcd-cluster.md diff --git a/docs/How-to-monitor-an-external-secure-etcd-cluster.md b/docs/How-to-monitor-an-external-secure-etcd-cluster.md new file mode 100644 index 00000000..e7a347fe --- /dev/null +++ b/docs/How-to-monitor-an-external-secure-etcd-cluster.md @@ -0,0 +1,164 @@ +# How to monitor a secure external etcd service with Prometheus Operator +This guide will help you monitor an external etcd cluster, i.e. one where etcd is not hosted inside Kubernetes. +This is often the case with the Kubernetes setup. This has been tested with kube-aws, but the same principles will apply to other tools. + +# Step 1 - Make the etcd certificates available to the Prometheus pod +Prometheus Operator (and Prometheus) allow us to specify a tlsConfig. This is required as most likely your etcd metrics endpoint is secured. + +## a - Create the secrets in the namespace +Prometheus Operator allows us to mount secrets in the pod. By loading the secrets as files, they can be made available inside the Prometheus pod. + +`kubectl -n monitoring create secret generic etcd-certs --from-file=CREDENTIAL_PATH/etcd-client.pem --from-file=CREDENTIAL_PATH/etcd-client-key.pem --from-file=CREDENTIAL_PATH/ca.pem` + +where CREDENTIAL_PATH is the path to your etcd client credentials on your work machine. +(Kube-aws stores them inside the credential folder). + +## b - Get Prometheus Operator to load the secret +In the previous step we have named the secret 'etcd-certs'.
+ +Edit prometheus-operator/contrib/kube-prometheus/manifests/prometheus/prometheus-k8s.yaml and add the secret under the spec of the Prometheus object manifest: + +``` + secrets: + - etcd-certs +``` + +The manifest will look like this: +``` +apiVersion: monitoring.coreos.com/v1alpha1 +kind: Prometheus +metadata: + name: k8s + labels: + prometheus: k8s +spec: + replicas: 2 + secrets: + - etcd-certs + version: v1.7.0 +``` + +If your Prometheus Operator is already in place, update it: + +`kubectl -n monitoring replace -f contrib/kube-prometheus/manifests/prometheus/prometheus-k8s.yaml` + +# Step 2 - Create the Service, endpoints and ServiceMonitor + +The below manifest creates a Service to expose etcd metrics (port 2379). +Replace IP_OF_YOUR_ETCD_NODE_[0/1/2] with the IP addresses of your etcd nodes. If you have more than one node, add them to the same list. + +In this example we use insecureSkipVerify: true as kube-aws default certificates are not valid against the IP. They were created for the DNS name. Depending on your use case, you might want to remove this flag or set it to false.
(true is required for kube-aws if using the default certificate generation method) + +``` +apiVersion: v1 +kind: Service +metadata: + name: etcd-k8s + labels: + k8s-app: etcd +spec: + type: ClusterIP + clusterIP: None + ports: + - name: api + port: 2379 + protocol: TCP +--- +apiVersion: v1 +kind: Endpoints +metadata: + name: etcd-k8s + labels: + k8s-app: etcd +subsets: +- addresses: + - ip: IP_OF_YOUR_ETCD_NODE_0 + nodeName: etcd0 + - ip: IP_OF_YOUR_ETCD_NODE_1 + nodeName: etcd1 + - ip: IP_OF_YOUR_ETCD_NODE_2 + nodeName: etcd2 + ports: + - name: api + port: 2379 + protocol: TCP +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: etcd-k8s + labels: + k8s-app: etcd-k8s +spec: + jobLabel: k8s-app + endpoints: + - port: api + interval: 30s + scheme: https + tlsConfig: + caFile: /etc/prometheus/secrets/etcd-certs/ca.pem + certFile: /etc/prometheus/secrets/etcd-certs/etcd-client.pem + keyFile: /etc/prometheus/secrets/etcd-certs/etcd-client-key.pem + insecureSkipVerify: true + selector: + matchLabels: + k8s-app: etcd + namespaceSelector: + matchNames: + - monitoring +``` + +# Step 3: Open the port + +You now need to allow the nodes Prometheus is running on to talk to etcd on port 2379 (if 2379 is the port used by etcd to expose the metrics). + +If using kube-aws, you will need to edit the inbound rules of the etcd security group, specifying the security group of your Kubernetes node (worker) as the source. + +## kube-aws and EIP or ENI inconsistency +With kube-aws, each etcd node has two IP addresses: + +* EC2 instance IP +* EIP or ENI (depending on the chosen method in your cluster.yaml) + +For some reason, some etcd nodes answer to :2379/metrics on the instance IP (eth0), some others on the EIP|ENI address (eth1). See issue https://github.com/kubernetes-incubator/kube-aws/issues/923 +It would be of course much better if we could hit the EIP/ENI all the time as they don't change even if the underlying EC2 instance goes down.
+If specifying the Instance IP (eth0) in the Prometheus Operator ServiceMonitor, and the EC2 instance goes down, one would have to update the ServiceMonitor. + +Another idea would be to use the DNS entries of etcd, but those are not currently supported for Endpoints objects in Kubernetes. + +# Step 4: verify + +Go to the Prometheus UI on :9090/config and check that you have an etcd job entry: +``` +- job_name: monitoring/etcd-k8s/0 + scrape_interval: 30s + scrape_timeout: 10s + ... +``` + +On the :9090/targets page, you should see "etcd" with the UP state. If not, check the Error column for more information. + +# Step 5: Grafana dashboard + +## Find a dashboard you like + +Try to load this dashboard: +https://grafana.com/dashboards/3070 + +## Save the dashboard in the configmap + +As documented here, [Developing Alerts and Dashboards](developing-alerts-and-dashboards.md), the Grafana instances are stateless. The dashboards are automatically re-loaded from the ConfigMap. +So if you load a dashboard through the Grafana UI, it won't be kept unless it is saved in the ConfigMap. + +Read [the document](developing-alerts-and-dashboards.md), but in summary: + +### Copy your dashboard: +Once you are happy with the dashboard, export it and move it to `prometheus-operator/contrib/kube-prometheus/assets/grafana/` (ending in "-dashboard.json") + +### Regenerate the grafana dashboard manifest: +`hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml` + +### Reload the manifest in Kubernetes: +`kubectl -n monitoring replace -f manifests/grafana/grafana-dashboards.yaml` + +After a few minutes your dashboard will be available permanently to all Grafana instances. \ No newline at end of file From ebffe0dae67b803f9fefeaf3f6fcdab8ad416c37 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Thu, 7 Sep 2017 11:21:59 +0200 Subject: [PATCH 108/638] Refer to CRDs as opposed to TPRs --- hack/cluster-monitoring/deploy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index d959029d..818db48c 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -16,7 +16,7 @@ kctl() { kctl apply -f manifests/prometheus-operator -# Wait for TPRs to be ready. +# Wait for CRDs to be ready. printf "Waiting for Operator to register custom resource definitions..." until kctl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done until kctl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done From dfd2ee2847a6a5a93da88d56240297450fa4d366 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 7 Sep 2017 13:44:12 +0200 Subject: [PATCH 109/638] assets: modify and add grafana dashboards --- assets/grafana/all-nodes-dashboard.json | 848 -- assets/grafana/deployment-dashboard.json | 1604 +-- ...kubernetes-capacity-planing-dashboard.json | 1048 ++ .../kubernetes-cluster-health-dashboard.json | 733 ++ .../kubernetes-cluster-status-dashboard.json | 896 ++ ...rnetes-control-plane-status-dashboard.json | 663 ++ assets/grafana/kubernetes-pods-dashboard.json | 398 - ...ubernetes-resource-requests-dashboard.json | 434 + assets/grafana/node-dashboard.json | 868 -- assets/grafana/nodes-dashboard.json | 892 ++ assets/grafana/pods-dashboard.json | 432 + .../grafana/resource-requests-dashboard.json | 424 - assets/prometheus/rules/node.rules | 7 +- manifests/grafana/grafana-dashboards.yaml | 9252 +++++++++++------ .../prometheus/prometheus-k8s-rules.yaml | 7 +- 15 files changed, 11841 insertions(+), 6665 deletions(-) delete mode 100644 assets/grafana/all-nodes-dashboard.json create mode 100644 assets/grafana/kubernetes-capacity-planing-dashboard.json create mode 100644 assets/grafana/kubernetes-cluster-health-dashboard.json create mode 100644 assets/grafana/kubernetes-cluster-status-dashboard.json create mode 100644 
assets/grafana/kubernetes-control-plane-status-dashboard.json delete mode 100644 assets/grafana/kubernetes-pods-dashboard.json create mode 100644 assets/grafana/kubernetes-resource-requests-dashboard.json delete mode 100644 assets/grafana/node-dashboard.json create mode 100644 assets/grafana/nodes-dashboard.json create mode 100644 assets/grafana/pods-dashboard.json delete mode 100644 assets/grafana/resource-requests-dashboard.json diff --git a/assets/grafana/all-nodes-dashboard.json b/assets/grafana/all-nodes-dashboard.json deleted file mode 100644 index fd36d882..00000000 --- a/assets/grafana/all-nodes-dashboard.json +++ /dev/null @@ -1,848 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - }, - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": 
"flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", - "hide": false, - "intervalFactor": 10, - "legendFormat": "", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_load1)", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load5)", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load15)", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - 
"xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", - "intervalFactor": 2, - "legendFormat": "memory usage", - "metric": "memo", - "refId": "A", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_Buffers)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "memo", - "refId": "B", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_Cached)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "memo", - "refId": "C", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_MemFree)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "memo", - "refId": "D", - "step": 4, - 
"target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "intervalFactor": 2, - "metric": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - 
"valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_disk_bytes_read[5m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 8, - "target": "" - }, - { - "expr": "sum(rate(node_disk_bytes_written[5m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 8 - }, - { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 8 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": 
false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": 
false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": 
"time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "prometheus" - ], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "All Nodes", - "version": 1 -} diff --git a/assets/grafana/deployment-dashboard.json b/assets/grafana/deployment-dashboard.json index 8bc9f40c..1ec5a6d9 100644 --- a/assets/grafana/deployment-dashboard.json +++ b/assets/grafana/deployment-dashboard.json @@ -1,806 +1,816 @@ { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "3.1.1" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - } - ], - "annotations": { - "list": [] + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + 
"name": "Grafana", + "version": "4.4.1" }, - "editable": true, - "gnetId": null, - "hideControls": false, - "id": null, - "links": [], - "rows": [ + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "200px", + "panels": [ { - "collapse": false, - "editable": true, - "height": "200px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "CPU", - "type": 
"singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "80%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "Bps", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 7, - "interval": null, - "isNew": 
true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Row" - }, - { - "collapse": false, - "editable": true, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 5, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - 
"prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "metric": "kube_deployment_spec_replicas", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Available 
Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": 
false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "hideZero": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "current replicas", - "refId": "A", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B", - "step": 30 - }, - { - "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "unavailable", - "refId": "C", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "D", - "step": 30 - }, - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "showTitle": false, - "title": "New row" - } - ], - "schemaVersion": 12, - "sharedCrosshair": true, - "style": "dark", - "tags": [], - "templating": { - "list": [ + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + 
"colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "deployment_namespace", - "options": [], - "query": "label_values(kube_deployment_metadata_generation, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tagsQuery": "", - "type": "query", - "useTags": false + "name": "value to text", + "value": 1 }, { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Deployment", - "multi": false, - "name": "deployment_name", - "options": [], - "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "deployment", - "type": "query", - "useTags": false + "name": "range to text", + "value": 2 } - ] + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + 
"thresholds": "", + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + 
}, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" }, - "time": { - "from": "now-6h", - "to": "now" + { + "collapse": false, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + 
"maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "metric": "kube_deployment_spec_replicas", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 
0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Deployment", - "version": 2 -} + { + "collapse": false, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": 
false, + "fill": 1, + "grid": {}, + "id": 1, + "legend": { + "avg": false, + "current": false, + "hideZero": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + 
"show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Deployment", + "version": 1 +} \ No newline at end of file diff --git a/assets/grafana/kubernetes-capacity-planing-dashboard.json b/assets/grafana/kubernetes-capacity-planing-dashboard.json new file mode 100644 index 00000000..81336453 --- /dev/null +++ 
b/assets/grafana/kubernetes-capacity-planing-dashboard.json @@ -0,0 +1,1048 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + 
}, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 
1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 246, + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": 
true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 20 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + 
"thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 12, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, 
+ "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 276, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_info)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current number of Pods", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_node_status_capacity_pods)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Maximum capacity of pods", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cluster Pod Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + 
"rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80,90", + "title": "Pod Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Capacity Planing", + "version": 4 +} \ No newline at end of file diff --git 
a/assets/grafana/kubernetes-cluster-health-dashboard.json b/assets/grafana/kubernetes-cluster-health-dashboard.json new file mode 100644 index 00000000..d04ff0c0 --- /dev/null +++ b/assets/grafana/kubernetes-cluster-health-dashboard.json @@ -0,0 +1,733 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": 254, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Control Plane Components Down", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "Everything UP and healthy", + "value": "null" + }, + { + "op": "=", + "text": "", + "value": "" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "3,5", + "title": "Alerts Firing", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 
0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"pending\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "3,5", + "title": "Alerts Pending", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": 
"null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Crashlooping Pods", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_ready{condition!=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 
600 + } + ], + "thresholds": "1,3", + "title": "Node Not Ready", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_disk_pressure{condition=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Node Disk Pressure", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": 
true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_memory_pressure{condition=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Node Memory Pressure", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + 
"tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_spec_unschedulable)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Nodes Unschedulable", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kubernetes Cluster Health", + "version": 12 +} \ No newline at end of file diff --git a/assets/grafana/kubernetes-cluster-status-dashboard.json b/assets/grafana/kubernetes-cluster-status-dashboard.json new file mode 100644 index 00000000..1b6d5b35 --- /dev/null +++ b/assets/grafana/kubernetes-cluster-status-dashboard.json @@ -0,0 +1,896 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": 
129, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Control Plane UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "UP", + "value": "null" + } + ], + "valueName": "total" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + 
"nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "3,5", + "title": "Alerts Firing", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Cluster Health", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 168, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "API Servers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "Controller Managers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", 
+ "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "Schedulers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + 
"postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Crashlooping Control Plane Pods", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Control Plane Status", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 158, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": 
"", + "targets": [ + { + "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "80,90", + "title": "CPU Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "80,90", + "title": "Memory Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": 
null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "80,90", + "title": "Filesystem Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 10, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + 
} + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "80,90", + "title": "Pod Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Capacity Planning", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kubernetes Cluster Status", + "version": 22 +} \ No newline at end of file diff --git a/assets/grafana/kubernetes-control-plane-status-dashboard.json b/assets/grafana/kubernetes-control-plane-status-dashboard.json new file mode 100644 index 00000000..03b547ac --- /dev/null +++ b/assets/grafana/kubernetes-control-plane-status-dashboard.json @@ -0,0 +1,663 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], +
"__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "API Servers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + 
"colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "Controller Managers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText":
null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "Schedulers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "topk(1, (sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], 
+ "thresholds": "5,10", + "title": "API Server Request Error Rate", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "API Server Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, 
+ "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "End to end scheduling latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "dtdurations", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Error Rate", + "refId": "A", + "step": 60 + }, + { + "expr": "sum by(instance) 
(rate(apiserver_request_count[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Request Rate", + "refId": "B", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "API Server Request Rates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kubernetes Control Plane Status", + "version": 11 +} \ No newline at end of file diff --git a/assets/grafana/kubernetes-pods-dashboard.json b/assets/grafana/kubernetes-pods-dashboard.json deleted file mode 100644 index 3428f36b..00000000 --- a/assets/grafana/kubernetes-pods-dashboard.json +++ /dev/null @@ -1,398 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "3.1.1" - }, - { - "id": "prometheus", - "name": 
"Prometheus", - "type": "datasource", - "version": "1.0.0" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Current: {{ container_name }}", - "metric": "container_memory_usage_bytes", - "refId": "A", - "step": 10 - }, - { - "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_resource_requests_memory_bytes", - "refId": "B", - "step": 20 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - 
"show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "Row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 2, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", - "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A", - "step": 30 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": 
"rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "refId": "A", - "step": 30 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Network I/O", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - } - ], - "schemaVersion": 12, - "sharedCrosshair": true, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "type": "query" - }, - { - 
"allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - "options": [], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 1, - "regex": "", - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Pods", - "version": 26 -} diff --git a/assets/grafana/kubernetes-resource-requests-dashboard.json b/assets/grafana/kubernetes-resource-requests-dashboard.json new file mode 100644 index 00000000..fe52cba7 --- /dev/null +++ b/assets/grafana/kubernetes-resource-requests-dashboard.json @@ -0,0 +1,434 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [CPU resource 
requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + "refId": "A", + "step": 20 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Cores", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU Cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + 
"thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "CPU Cores", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + 
"links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A", + "step": 20 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": 
{ + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Resource Requests", + "version": 2 +} \ No newline at end of file diff --git a/assets/grafana/node-dashboard.json b/assets/grafana/node-dashboard.json deleted file mode 100644 index 9a831aaa..00000000 --- a/assets/grafana/node-dashboard.json +++ /dev/null @@ -1,868 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - }, - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - 
"version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", - "hide": false, - "intervalFactor": 10, - "legendFormat": "{{cpu}}", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": 
[], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_load1{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "node_load5{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "node_load15{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - 
"seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory used", - "metric": "", - "refId": "C", - "step": 4 - }, - { - "expr": "node_memory_Buffers{instance=\"$server\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "", - "refId": "E", - "step": 4 - }, - { - "expr": "node_memory_Cached{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "", - "refId": "F", - "step": 4 - }, - { - "expr": "node_memory_MemFree{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "", - "refId": "D", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - 
"interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "span": 9, - "stack": false, 
- "steppedLine": false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 8, - "target": "" - }, - { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 8 - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 8 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - 
"fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": 
[ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "prometheus" - ], - "templating": { - "list": [ - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": null, - "multi": 
false, - "name": "server", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Nodes", - "version": 1 -} diff --git a/assets/grafana/nodes-dashboard.json b/assets/grafana/nodes-dashboard.json new file mode 100644 index 00000000..7ab526c3 --- /dev/null +++ b/assets/grafana/nodes-dashboard.json @@ -0,0 +1,892 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": 
false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + 
"target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + 
"intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 10 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 10 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 10 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": 
"null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum by (instance) 
(rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 20 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - 
sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", 
+ "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [], + "query": 
"label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 2 +} \ No newline at end of file diff --git a/assets/grafana/pods-dashboard.json b/assets/grafana/pods-dashboard.json new file mode 100644 index 00000000..2d3c1c84 --- /dev/null +++ b/assets/grafana/pods-dashboard.json @@ -0,0 +1,432 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 15 + }, + { + "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name 
}}", + "refId": "A", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + 
"tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 1 +} \ No newline at end of file diff --git a/assets/grafana/resource-requests-dashboard.json b/assets/grafana/resource-requests-dashboard.json deleted file mode 100644 index e34315b8..00000000 --- a/assets/grafana/resource-requests-dashboard.json +++ /dev/null @@ -1,424 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - }, - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to show the resource requests vs allocatable in the cluster", - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also 
shown.", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable CPU Cores", - "refId": "A", - "step": 10 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested CPU Cores", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "CPU Cores", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "CPU Cores", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - 
"postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "CPU Cores", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "CPU Cores", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable Memory", - "refId": "A", - 
"step": 10 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested Memory", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": "Memory", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "Memory", - "type": 
"singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Memory", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Resource Requests", - "version": 1 -} diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules index 94af0990..4f768671 100644 --- a/assets/prometheus/rules/node.rules +++ b/assets/prometheus/rules/node.rules @@ -8,8 +8,9 @@ ALERT NodeExporterDown summary = "node-exporter cannot be scraped", description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } + ALERT K8SNodeOutOfDisk - IF kube_node_status_condition{condition"OutOfDisk", status="true"} == 1 + IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 LABELS { service = "k8s", severity = "critical" @@ -18,7 +19,7 @@ ALERT K8SNodeOutOfDisk summary = "Node ran out of disk space.", description = "{{ $labels.node }} has run out of disk space.", } - + ALERT K8SNodeMemoryPressure IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 LABELS { @@ -29,7 +30,7 @@ ALERT K8SNodeMemoryPressure summary = "Node is under memory pressure.", description = "{{ $labels.node }} is under memory pressure.", } - + ALERT K8SNodeDiskPressure IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 LABELS { diff --git a/manifests/grafana/grafana-dashboards.yaml 
b/manifests/grafana/grafana-dashboards.yaml index 92a88fbf..f7903dc2 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -3,1678 +3,825 @@ kind: ConfigMap metadata: name: grafana-dashboards data: - all-nodes-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - }, - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", - "hide": false, - "intervalFactor": 10, - "legendFormat": "", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - 
"tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_load1)", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load5)", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load15)", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - 
"repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", - "intervalFactor": 2, - "legendFormat": "memory usage", - "metric": "memo", - "refId": "A", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_Buffers)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "memo", - "refId": "B", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_Cached)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "memo", - "refId": "C", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_MemFree)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "memo", - "refId": "D", - "step": 4, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": 
"bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "intervalFactor": 2, - "metric": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": 
false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_disk_bytes_read[5m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 8, - "target": "" - }, - { - "expr": "sum(rate(node_disk_bytes_written[5m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 8 - }, - { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 8 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - 
"interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - 
"refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - 
"repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "prometheus" - ], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "All Nodes", - "version": 1 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } deployment-dashboard.json: |+ { "dashboard": { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "3.1.1" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - } - ], - "annotations": { - "list": [] + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" }, - "editable": true, - "gnetId": null, - "hideControls": false, - "id": null, - "links": [], - "rows": [ + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": 
"Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "200px", + "panels": [ { - "collapse": false, - "editable": true, - "height": "200px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "CPU", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - 
"error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "80%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "Bps", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 7, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": 
"null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Row" - }, - { - "collapse": false, - "editable": true, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 5, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without 
(instance, pod)", - "intervalFactor": 2, - "metric": "kube_deployment_spec_replicas", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Available Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - 
"editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - 
"prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "hideZero": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "current replicas", - "refId": "A", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": 
"available", - "refId": "B", - "step": 30 - }, - { - "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "unavailable", - "refId": "C", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "D", - "step": 30 - }, - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "showTitle": false, - "title": "New row" - } - ], - "schemaVersion": 12, - "sharedCrosshair": true, - "style": "dark", - "tags": [], - "templating": { - "list": [ + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ { - "allValue": ".*", - 
"current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "deployment_namespace", - "options": [], - "query": "label_values(kube_deployment_metadata_generation, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tagsQuery": "", - "type": "query", - "useTags": false + "name": "value to text", + "value": 1 }, { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Deployment", - "multi": false, - "name": "deployment_name", - "options": [], - "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "deployment", - "type": "query", - "useTags": false + "name": "range to text", + "value": 2 } - ] + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": 
false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + 
"span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" }, - "time": { - "from": "now-6h", - "to": "now" + { + "collapse": false, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "metric": "kube_deployment_spec_replicas", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 
54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": 
"connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Deployment", - "version": 2 - } - , + { + "collapse": false, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 1, + "legend": { + "avg": false, + "current": false, + "hideZero": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + 
{ + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Deployment", + "version": 1 + }, "inputs": [ { "name": "DS_PROMETHEUS", @@ -1685,408 +832,1057 @@ data: ], "overwrite": true } - kubernetes-pods-dashboard.json: |+ + kubernetes-capacity-planing-dashboard.json: |+ { "dashboard": { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "3.1.1" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - } - ], - "annotations": { - "list": [] + "__inputs": [ + { + "name": 
"DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" }, - "editable": true, - "gnetId": null, - "hideControls": false, - "id": null, - "links": [], - "rows": [ + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Current: {{ container_name }}", - "metric": 
"container_memory_usage_bytes", - "refId": "A", - "step": 10 - }, - { - "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_resource_requests_memory_bytes", - "refId": "B", - "step": 20 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "Row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 2, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", - "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A", - "step": 30 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU 
Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "refId": "A", - "step": 30 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Network I/O", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - } - ], - "schemaVersion": 
12, - "sharedCrosshair": true, - "style": "dark", - "tags": [], - "templating": { - "list": [ + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - "options": [], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 1, - "regex": "", - "type": "query" + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + "refId": "A", + "step": 50 } - ] + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": 
"cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": 
null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" }, - "time": { - "from": "now-6h", - "to": "now" + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + 
"xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" }, - "timepicker": { - "refresh_intervals": [ 
- "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] + { + "collapse": false, + "height": 246, + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 20 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, 
+ "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 12, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" }, - "timezone": "browser", - "title": "Pods", - "version": 26 - } - , + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { 
+ "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + 
"intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 276, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_info)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current number of Pods", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_node_status_capacity_pods)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Maximum capacity of pods", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cluster Pod Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80,90", + "title": "Pod Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + 
"templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Capacity Planing", + "version": 4 + }, "inputs": [ { "name": "DS_PROMETHEUS", @@ -2097,878 +1893,742 @@ data: ], "overwrite": true } - node-dashboard.json: |+ + kubernetes-cluster-health-dashboard.json: |+ { "dashboard": { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - }, - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - } - ], - "annotations": { - "list": [] + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, 
+ "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": 254, + "panels": [ { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", - "hide": false, - "intervalFactor": 10, - "legendFormat": "{{cpu}}", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": 
false, - "steppedLine": false, - "targets": [ - { - "expr": "node_load1{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "node_load5{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "node_load15{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "span": 9, - "stack": true, - "steppedLine": false, 
- "targets": [ - { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory used", - "metric": "", - "refId": "C", - "step": 4 - }, - { - "expr": "node_memory_Buffers{instance=\"$server\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "", - "refId": "E", - "step": 4 - }, - { - "expr": "node_memory_Cached{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "", - "refId": "F", - "step": 4 - }, - { - "expr": "node_memory_MemFree{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "", - "refId": "D", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - 
"maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", - "hide": false, - "intervalFactor": 4, - 
"legendFormat": "read", - "refId": "A", - "step": 8, - "target": "" - }, - { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 8 - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 8 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": 
"(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - 
"label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "prometheus" - ], - "templating": { - "list": [ + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 
0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "server", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 } - ] + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Control Plane Components Down", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "Everything UP and healthy", + "value": "null" + }, + { + "op": "=", + "text": "", + "value": "" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + 
"mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "3,5", + "title": "Alerts Firing", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"pending\"})", + 
"format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "3,5", + "title": "Alerts Pending", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Crashlooping Pods", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", 
- "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Nodes", - "version": 1 - } - , + { + "collapse": false, + "height": 250, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_ready{condition!=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Node Not Ready", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + 
"thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_disk_pressure{condition=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Node Disk Pressure", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": 
"rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_memory_pressure{condition=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Node Memory Pressure", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_spec_unschedulable)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Nodes Unschedulable", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" 
+ } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kubernetes Cluster Health", + "version": 12 + }, "inputs": [ { "name": "DS_PROMETHEUS", @@ -2979,434 +2639,3378 @@ data: ], "overwrite": true } - resource-requests-dashboard.json: |+ + kubernetes-cluster-status-dashboard.json: |+ { "dashboard": { - "__inputs": [ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": 129, + "panels": [ { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + 
"name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Control Plane UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "UP", + "value": "null" + } + ], + "valueName": "total" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\"})", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "3,5", + "title": "Alerts Firing", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" } - ], - "__requires": [ + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Cluster Health", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 168, + "panels": [ { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "API Servers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + 
"valueName": "current" }, { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "Controller Managers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" }, { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": 
false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "Schedulers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" }, { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": 
"null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1,3", + "title": "Crashlooping Control Plane Pods", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" } - ], - "annotations": { - "list": [] + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Control Plane Status", + "titleSize": "h6" }, - "description": "Dashboard to show the resource requests vs allocatable in the cluster", - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "rows": [ + { + "collapse": false, + "height": 158, + "panels": [ { - "collapse": false, - "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": 
false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable CPU Cores", - "refId": "A", - "step": 10 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested CPU Cores", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "CPU Cores", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "CPU Cores", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by 
(instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "CPU Cores", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "CPU Cores", - "titleSize": "h6" + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "80,90", + "title": "CPU Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" }, { - "collapse": false, 
- "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable Memory", - "refId": "A", - "step": 10 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested Memory", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": "Memory", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": 
"${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Memory", - "titleSize": "h6" + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + 
"postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "80,90", + "title": "Memory Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "80,90", + "title": "Filesystem Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 10, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "80,90", + "title": "Pod Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Capacity Planing", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { 
+ "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Resource Requests", - "version": 1 + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kubernetes Cluster Status", + "version": 22 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true } - , + kubernetes-control-plane-status-dashboard.json: |+ + { + "dashboard": + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" 
+ ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "API Servers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, 
+ "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "Controller Mangers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50,80", + "title": "Schedulers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + 
"colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "topk(1, (sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "5,10", + "title": "API Server Request Error Rate", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": 
"null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "API Server Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "End to end scheduling latency", + "tooltip": { + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "dtdurations", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Error Rate", + "refId": "A", + "step": 60 + }, + { + "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Request Rate", + "refId": "B", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "API Server Request Rates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, 
+ "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kubernetes Control Plane Status", + "version": 11 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + kubernetes-resource-requests-dashboard.json: |+ + { + "dashboard": + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU 
cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + "refId": "A", + "step": 20 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Cores", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU Cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 
1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "CPU Cores", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + 
"stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A", + "step": 20 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Resource Requests", + "version": 2 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + nodes-dashboard.json: |+ + { + "dashboard": + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, 
+ "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + 
"points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 10 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 10 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 10 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + 
"thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": 
"{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 20 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + 
"postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + 
"repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 2 + }, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } + pods-dashboard.json: |+ + { + "dashboard": + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": 
false, + "fill": 1, + "grid": {}, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 15 + }, + { + "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + 
"editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + 
"percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + 
"allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 1 + }, "inputs": [ { "name": "DS_PROMETHEUS", diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index a4459b43..71282c51 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -582,8 +582,9 @@ data: summary = "node-exporter cannot be scraped", description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", } + ALERT K8SNodeOutOfDisk - IF kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 LABELS { service = "k8s", severity = "critical" @@ -592,7 +593,7 @@ data: summary = "Node ran out of disk space.", description = "{{ $labels.node }} has run out of disk space.", } - + ALERT K8SNodeMemoryPressure IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 LABELS { @@ -603,7 +604,7 @@ data: summary = "Node is under memory pressure.", description = "{{ $labels.node }} is under memory pressure.", } - + ALERT K8SNodeDiskPressure IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 
LABELS { From ab9ba85f8b0fc1e491154cea42559be407431020 Mon Sep 17 00:00:00 2001 From: Vincent Brouillet Date: Fri, 8 Sep 2017 10:54:05 +1000 Subject: [PATCH 110/638] minor fixes to monitoring external etcd --- ...cd-cluster.md => Monitoring external etcd.md} | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) rename docs/{How-to-monitor-an-external-secure-etcd-cluster.md => Monitoring external etcd.md} (85%) diff --git a/docs/How-to-monitor-an-external-secure-etcd-cluster.md b/docs/Monitoring external etcd.md similarity index 85% rename from docs/How-to-monitor-an-external-secure-etcd-cluster.md rename to docs/Monitoring external etcd.md index e7a347fe..65dfe139 100644 --- a/docs/How-to-monitor-an-external-secure-etcd-cluster.md +++ b/docs/Monitoring external etcd.md @@ -1,6 +1,6 @@ -# How to monitor a secure external etcd service with Prometheus Operator -This guide will help you monitor an external etcd cluster. When the etcd is not hosted inside Kubernetes. -This is often the case with the Kubernetes setup. This has been tested with kube-aws but same principals will apply to other tools. +# Monitoring external etcd +This guide will help you monitor an external etcd cluster, i.e. an etcd cluster that is not hosted inside Kubernetes. +This is often the case with Kubernetes setups. This approach has been tested with kube-aws but the same principles apply to other tools. # Step 1 - Make the etcd certificates available to Prometheus pod Prometheus Operator (and Prometheus) allow us to specify a tlsConfig. This is required as most likely your etcd metrics end points is secure. @@ -45,7 +45,11 @@ If your Prometheus Operator is already in place, update it: # Step 2 - Create the Service, endpoints and ServiceMonitor The below manifest creates a Service to expose etcd metrics (port 2379) -Replace IP_OF_YOUR_ETCD_NODE_[0/1/2] with the IP addresses of your etcd nodes. If you have more than one node, add them to the same list.
+ +* Replace `IP_OF_YOUR_ETCD_NODE_[0/1/2]` with the IP addresses of your etcd nodes. If you have more than one node, add them to the same list. +* Either uncomment `#insecureSkipVerify: true` or replace `ETCD_DNS_OR_ALTERNAME_NAME` with a valid name for the certificate. + +In case you have generated the etcd certificates with kube-aws, you will need to use insecureSkipVerify as the valid certificate domain will be different for each etcd node (etcd0, etcd1, etcd2). If you only have one etcd node, you can use the value from `etcd.internalDomainName` specified in your kube-aws `cluster.yaml`. In this example we use insecureSkipVerify: true as kube-aws default certiicates are not valid against the IP. They were created for the DNS. Depending on your use case, you might want to remove this flag or set it to false. (true required for kube-aws if using default certificate generators method) @@ -99,7 +103,9 @@ spec: caFile: /etc/prometheus/secrets/etcd-certs/ca.pem certFile: /etc/prometheus/secrets/etcd-certs/etcd-client.pem keyFile: /etc/prometheus/secrets/etcd-certs/etcd-client-key.pem - insecureSkipVerify: true + #use insecureSkipVerify only if you cannot use a Subject Alternative Name + #insecureSkipVerify: true + serverName: ETCD_DNS_OR_ALTERNAME_NAME selector: matchLabels: k8s-app: etcd From 88fe225e8122702643cdae81ef8bd357f5ebf661 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 8 Sep 2017 18:13:20 +0800 Subject: [PATCH 111/638] add tolerations to allow node-exporter running on all nodes --- manifests/node-exporter/node-exporter-daemonset.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml index d42413cb..b7696694 100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -35,6 +35,9 @@ spec: - name: sys readOnly: true mountPath: /host/sys + tolerations: + - effect: NoSchedule + operator: Exists 
volumes: - name: proc hostPath: From d0aa27b0812bba11e45144633373b2dc8dcf3517 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Sat, 9 Sep 2017 12:14:02 +0200 Subject: [PATCH 112/638] memory prometheus raised to 4g --- manifests/prometheus/prometheus-k8s.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 421c024d..84bd02a8 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -21,7 +21,7 @@ spec: # memory. Modify based on your target and time-series count for # production use. This value is mainly meant for demonstration/testing # purposes. - memory: 400Mi + memory: 4Gi alerting: alertmanagers: - namespace: monitoring From 53b616e77d07810077c07bad2813e70aa02a941f Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Tue, 15 Aug 2017 17:24:22 +0200 Subject: [PATCH 113/638] deleted configmap generator --- hack/grafana-dashboards-configmap-generator | 1 - 1 file changed, 1 deletion(-) delete mode 160000 hack/grafana-dashboards-configmap-generator diff --git a/hack/grafana-dashboards-configmap-generator b/hack/grafana-dashboards-configmap-generator deleted file mode 160000 index f901955e..00000000 --- a/hack/grafana-dashboards-configmap-generator +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f901955e8c95b8dd7f3c038caabc0a8d41eec125 From 46671b7ee4c55bfb32d7dad87cc7143ac81514bd Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Tue, 15 Aug 2017 17:27:02 +0200 Subject: [PATCH 114/638] grafana configmap generator tool added --- .../README.md | 51 + .../bin/grafana_dashboards_generate.sh | 317 +++++++ .../output/README.md | 1 + .../templates/ConfigMap.header | 5 + .../templates/dashboard.foot | 11 + .../templates/dashboard.header | 2 + .../templates/grafana-dashboards/README.md | 1 + .../all-nodes-dashboard.json | 848 +++++++++++++++++ .../deployment-dashboard.json | 806 ++++++++++++++++ 
.../kubernetes-pods-dashboard.json | 398 ++++++++ .../grafana-dashboards/node-dashboard.json | 868 ++++++++++++++++++ .../prometheus-datasource.json | 7 + .../resource-requests-dashboard.json | 424 +++++++++ 13 files changed, 3739 insertions(+) create mode 100644 hack/grafana-dashboards-configmap-generator/README.md create mode 100755 hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh create mode 100644 hack/grafana-dashboards-configmap-generator/output/README.md create mode 100644 hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header create mode 100644 hack/grafana-dashboards-configmap-generator/templates/dashboard.foot create mode 100644 hack/grafana-dashboards-configmap-generator/templates/dashboard.header create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/README.md create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/all-nodes-dashboard.json create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/deployment-dashboard.json create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/kubernetes-pods-dashboard.json create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/node-dashboard.json create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/prometheus-datasource.json create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/resource-requests-dashboard.json diff --git a/hack/grafana-dashboards-configmap-generator/README.md b/hack/grafana-dashboards-configmap-generator/README.md new file mode 100644 index 00000000..cc4f51a3 --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/README.md @@ -0,0 +1,51 @@ +# Grafana Dashboards Configmap Generator + +## Description: +Tool to maintain grafana dashboards' configmap for a grafana deployed with kube-prometheus (a 
tool inside prometheus-operator). + +The tool reads the content of a directory with grafana .json resources (dashboards and datasources) and creates a manifest file under output/ directory with all the content from the files in a Kubernetes ConfigMap format. + +Based on a configurable size limit, the tool will create 1 or N configmaps to allocate the .json resources (bin packing). If the limit is reached then the configmaps generated will have names like grafana-dashboards-0, grafana-dashboards-1, etc, and if the limit is not reached the configmap generated will be called "grafana-dashboards". + +Input Parameters Allowed: +```bash +-i dir, --input-dir dir + Directory with grafana dashboards to process. + Important notes: + Files should be suffixed with -dashboard.json or -datasource.json. + We don't recommend file names with spaces. + +-o file, --output-file file + Output file for config maps. + +-s NUM, --size-limit NUM + Size limit in bytes for each generated configmap; larger files are skipped (default: 240000) + +-n namespace, --namespace namespace + Namespace for the configmap (default: monitoring). + +-x, --apply-configmap + Applies the generated configmap with kubectl. + +--apply-type + Type of kubectl command. Accepted values: apply, replace, create (default: apply). +``` + +## Usage + +Just execute the .sh under bin/ directory. The output will be placed in the output/ directory. + +Examples: +```bash +$ ./grafana_dashboards_generate.sh +$ bin/grafana_dashboards_generate.sh -o manifests/grafana/grafana-dashboards.yaml -i assets/grafana-dashboards +$ bin/grafana_dashboards_generate.sh -s 1000000 --apply-configmap --apply-type replace + +# Note: the output file, if provided with -o, shouldn't exist. +``` + +## Configuration and options + +* Put the json files you want to pack in the templates/grafana-dashboards/ directory +* Size limit default is 240000 bytes due to the annotations size limit in kubernetes of 256KB.
+ diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh new file mode 100755 index 00000000..259e81b9 --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -0,0 +1,317 @@ +#!/bin/bash +# Author: eedugon + +# Description: Tool to maintain grafana dashboards configmap for a grafana deployed +# with kube-prometheus (a tool inside prometheus-operator) +# The tool reads the content of a directory with grafana .json resources +# that need to be moved into a configmap. +# Based on a configurable size limit, the tool will create 1 or N configmaps +# to allocate the .json resources (bin packing) + +# parameters +# -o, --output-file +# -i, --input-dir +# -s, --size-limit +# -x, --apply-configmap : true or false (default = false) +# --apply-type : create, replace, apply (default = apply) + +# +# Basic Functions +# +echoSyntax() { + echo "Usage: ${0} [options]" + echo "Options:" + echo -e "\t-i dir, --input-dir dir" + echo -e "\t\tDirectory with grafana dashboards to process." + echo -e "\t\tImportant notes:" + echo -e "\t\t\tFiles should be suffixed with -dashboard.json or -datasource.json." + echo -e "\t\t\tWe don't recommend file names with spaces." + echo + echo -e "\t-o file, --output-file file" + echo -e "\t\tOutput file for config maps." + echo + echo -e "\t-s NUM, --size-limit NUM" + echo -e "\t\tSize limit in bytes for each dashboard (default: 240000)" + echo + echo -e "\t-n namespace, --namespace namespace" + echo -e "\t\tNamespace for the configmap (default: monitoring)." + echo + echo -e "\t-x, --apply-configmap" + echo -e "\t\tApplies the generated configmap with kubectl." + echo + echo -e "\t--apply-type" + echo -e "\t\tType of kubectl command. Accepted values: apply, replace, create (default: apply)." 
+} + + +# # Apply changes --> environment allowed +# test -z "$APPLY_CONFIGMAP" && APPLY_CONFIGMAP="false" +# # Size limit --> environment set allowed +# test -z "$DATA_SIZE_LIMIT" && DATA_SIZE_LIMIT="240000" # in bytes +# # Changes type: in case of problems with k8s configmaps, try replace. Should be apply +# test -z "$APPLY_TYPE" && APPLY_TYPE="apply" +# # Input values verification +# echo "$DATA_SIZE_LIMIT" | grep -q "^[0-9]\+$" || { echo "ERROR: Incorrect value for DATA_SIZE_LIMIT: $DATA_SIZE_LIMIT. Number expected"; exit 1; } + +# Base variables (do not change them) +DATE_EXEC="$(date "+%Y-%m-%d-%H%M%S")" +BIN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +TOOL_HOME="$(dirname $BIN_DIR)" +SCRIPT_BASE=`basename $0 | sed "s/\.[Ss][Hh]//"` + +TEMPLATES_DIR="$TOOL_HOME/templates" +DASHBOARD_HEADER_FILE="$TEMPLATES_DIR/dashboard.header" +DASHBOARD_FOOT_FILE="$TEMPLATES_DIR/dashboard.foot" +CONFIGMAP_HEADER="$TEMPLATES_DIR/ConfigMap.header" +OUTPUT_BASE_DIR="$TOOL_HOME/output" + +# Some default values +OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-dashboards-configMap-$DATE_EXEC.yaml" +DASHBOARDS_DIR="$TEMPLATES_DIR/grafana-dashboards" + +APPLY_CONFIGMAP="false" +APPLY_TYPE="apply" +DATA_SIZE_LIMIT="240000" +NAMESPACE="monitoring" + +# Input parameters +while (( "$#" )); do + case "$1" in + "-o" | "--output-file") + OUTPUT_FILE="$2" + shift + ;; + "-i" | "--input-dir") + DASHBOARDS_DIR="$2" + shift + ;; + "-n" | "--namespace") + NAMESPACE="$2" + shift + ;; + "-x" | "--apply-configmap") + APPLY_CONFIGMAP="true" + ;; + "--apply-type") + APPLY_TYPE="$2" + test "$APPLY_TYPE" != "create" && test "$APPLY_TYPE" != "apply" && test "$APPLY_TYPE" != "replace" && { echo "Unexpected APPLY_TYPE: $APPLY_TYPE"; exit 1; } + shift + ;; + "-s"|"--size-limit") + if ! 
( echo $2 | grep -q '^[0-9]\+$') || [ $2 -eq 0 ]; then + echo "Invalid value for size limit '$2'" + exit 1 + fi + DATA_SIZE_LIMIT=$2 + shift + ;; + "-h"|"--help") + echoSyntax + exit 0 + ;; + *) + echo "Unknown argument: $1" + exit 1 + ;; + esac + shift +done + +# +# Main Functions +# +addConfigMapHeader() { + # If a parameter is provided it will be used as the configmap index. + # If no parameter is provided, the name will be kept + test "$#" -le 1 || { echo "# INTERNAL ERROR: Wrong call to function addConfigMapHeader"; return 1; } + local id="$1" + + if [ "$id" ]; then + cat "$CONFIGMAP_HEADER" | sed "s/name: grafana-dashboards/name: grafana-dashboards-$id/" + else + cat "$CONFIGMAP_HEADER" + fi +} + +addArrayToConfigMap() { + # This function process the array to_process into a configmap + + local OLDIFS=$IFS + local IFS=$'\n' + for file in ${to_process[@]}; do + # check that file exists + test -f "$file" || { echo "# INTERNAL ERROR IN ARRAY: File not found: $file"; continue; } + + # detection of type (dashboard or datasource) + type="" + basename "$file" | grep -q "\-datasource" && type="datasource" + basename "$file" | grep -q "\-dashboard" && type="dashboard" + test "$type" || { echo "# ERROR: Unrecognized file type: $(basename $file)"; return 1; } + + #echo "# Processing $type $file" + # Indent 2 + echo " $(basename $file): |+" + + # Dashboard header: No indent needed + test "$type" = "dashboard" && cat $DASHBOARD_HEADER_FILE + + # File content: Indent 4 + cat $file | sed "s/^/ /" + + # Dashboard foot + test "$type" = "dashboard" && cat $DASHBOARD_FOOT_FILE + done + echo "---" + + IFS=$OLDIFS + return 0 +} + +initialize-bin-pack() { + # We separate initialization to reuse the bin-pack for different sets of files. 
+ n="0" + to_process=() + bytes_to_process="0" + total_files_processed="0" + total_configmaps_created="0" +} + +bin-pack-files() { + # Algorithm: + # We process the files with no special order consideration + # We create an array/queue of "files to add to configmap" called "to_process" + # Size of the file is analyzed to determine if it can be added to the queue or not. + # the max size of the queue is limited by DATA_SIZE_LIMIT + # while there's room available in the queue we add files. + # when there's no room we create a configmap with the members of the queue + # before adding the file to a cleaned queue + + # Counters initialization is not in the scope of this function + local file="" + OLDIFS=$IFS + IFS=$'\n' +# echo "DEBUG bin-pack:" +# echo "$@" + + for file in $@; do + test -f "$file" || { echo "# INTERNAL ERROR: File not found: $file"; continue; } +# echo "debug: Processing file $(basename $file)" + + file_size_bytes="$(stat -c%s "$file")" + + # If the file is bigger than the configured limit we skip it file + if [ "$file_size_bytes" -gt "$DATA_SIZE_LIMIT" ]; then + echo "ERROR: File $(basename $file) bigger than size limit: $DATA_SIZE_LIMIT ($file_size_bytes). Skipping" + continue + fi + (( total_files_processed++ )) + + if test "$(expr "$bytes_to_process" + "$file_size_bytes")" -le "$DATA_SIZE_LIMIT"; then + # We have room to include the file in the configmap + # test "$to_process" && to_process="$to_process $file" || to_process="$file" + to_process+=("$file") + (( bytes_to_process = bytes_to_process + file_size_bytes )) + echo "# File $(basename $file) : added to queue" + else + # There's no room to add this file to the queue. so we process what we have and add the file to the queue + if [ "$to_process" ]; then + echo + echo "# Size limit ($DATA_SIZE_LIMIT) reached. Processing queue with $bytes_to_process bytes. 
Creating configmap with id $n" + echo + # Create a new configmap + addConfigMapHeader $n >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } + addArrayToConfigMap >> $OUTPUT_FILE || { echo "ERROR in call to addArrayToConfigMap function"; exit 1; } + # Initialize variables with info about file not processed + (( total_configmaps_created++ )) + (( n++ )) + # to_process="$file" + to_process=() + to_process+=("$file") + bytes_to_process="$file_size_bytes" + echo "# File $(basename $file) : added to queue" + else + # based on the algorithm the queue should never be empty if we reach this part of the code + # if this happens maybe bytes_to_process was not aligned with the queue (to_process) + echo "ERROR (unexpected)" + fi + fi + done + IFS=$OLDIFS +} + +# Some variables checks... +test ! -d "$TEMPLATES_DIR" && { echo "ERROR: missing templates directory $TEMPLATES_DIR"; exit 1; } + +test -f "$DASHBOARD_FOOT_FILE" || { echo "Template $DASHBOARD_FOOT_FILE not found"; exit 1; } +test -f "$DASHBOARD_HEADER_FILE" || { echo "Template $DASHBOARD_HEADER_FILE not found"; exit 1; } +test -f "$CONFIGMAP_HEADER" || { echo "Template $CONFIGMAP_HEADER not found"; exit 1; } + +test ! 
-d "$OUTPUT_BASE_DIR" && { echo "ERROR: missing directory $OUTPUT_BASE_DIR"; exit 1; } + +# Initial checks +test -d "$DASHBOARDS_DIR" || { echo "ERROR: Dashboards directory not found: $DASHBOARDS_DIR"; echoSyntax; exit 1; } + +test -f "$OUTPUT_FILE" && { echo "ERROR: Output file already exists: $OUTPUT_FILE"; exit 1; } +touch $OUTPUT_FILE || { echo "ERROR: Unable to create or modify $OUTPUT_FILE"; exit 1; } + +# Main code start + +echo "# Starting execution of $SCRIPT_BASE on $DATE_EXEC" +echo "# Configured size limit: $DATA_SIZE_LIMIT bytes" +echo "# Grafna input dashboards and datasources will be read from: $DASHBOARDS_DIR" +echo "# Grafana Dashboards ConfigMap will be created into file:" +echo "$OUTPUT_FILE" +echo + +# Loop variables initialization +initialize-bin-pack + +# Process dashboards +bin-pack-files "$(find $DASHBOARDS_DIR -maxdepth 1 -type f -name "*-dashboard.json" | sort)" + +# Continue processing datasources (maintaining the same queue) +bin-pack-files "$(find $DASHBOARDS_DIR -maxdepth 1 -type f -name "*-datasource.json" | sort )" + +# Processing remaining data in the queue (or unique) +if [ "$to_process" ]; then + if [ "$n" -eq 0 ]; then + echo + echo "# Size limit not reached ($bytes_to_process). Adding all files into basic configmap" + echo + addConfigMapHeader >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } + else + echo + echo "# Size limit not reached ($bytes_to_process). 
Adding remaining files into configmap with id $n" + echo + addConfigMapHeader $n >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } + fi + addArrayToConfigMap >> $OUTPUT_FILE || { echo "ERROR in call to addArrayToConfigMap function"; exit 1; } + (( total_configmaps_created++ )) + to_process=() +fi + +echo "# Process completed, configmap created: $(basename $OUTPUT_FILE)" +echo "# Summary" +echo "# Total files processed: $total_files_processed" +echo "# Total amount of ConfigMaps inside the manifest: $total_configmaps_created" + +# If output file is empty we can delete it and exit +test ! -s "$OUTPUT_FILE" && { echo "# Configmap empty, deleting file"; rm $OUTPUT_FILE; exit 0; } + +if [ "$APPLY_CONFIGMAP" = "true" ]; then + test -x "$(which kubectl)" || { echo "ERROR: kubectl command not available. Apply configmap not possible"; exit 1; } + echo + if kubectl -n $NAMESPACE $APPLY_TYPE -f "$OUTPUT_FILE"; then + echo + echo "# ConfigMap updated. Wait until grafana-watcher applies the changes and reloads the dashboards." + else + echo + echo "ERROR APPLYING CONFIGURATION. 
Check yaml file" + echo "$OUTPUT_FILE" + fi +else + echo + echo "# To apply the new configMap to your k8s system do something like:" + echo "kubectl -n monitoring $APPLY_TYPE -f $(basename $OUTPUT_FILE)" + echo +fi diff --git a/hack/grafana-dashboards-configmap-generator/output/README.md b/hack/grafana-dashboards-configmap-generator/output/README.md new file mode 100644 index 00000000..68590878 --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/output/README.md @@ -0,0 +1 @@ +### This directory will include all generated manifests diff --git a/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header b/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header new file mode 100644 index 00000000..afc1f42c --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards +data: diff --git a/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot b/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot new file mode 100644 index 00000000..81fe9f6f --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot @@ -0,0 +1,11 @@ + , + "inputs": [ + { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } diff --git a/hack/grafana-dashboards-configmap-generator/templates/dashboard.header b/hack/grafana-dashboards-configmap-generator/templates/dashboard.header new file mode 100644 index 00000000..807e5d38 --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/dashboard.header @@ -0,0 +1,2 @@ + { + "dashboard": diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/README.md b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/README.md new file mode 100644 index 00000000..69be0eec --- /dev/null +++ 
b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/README.md @@ -0,0 +1 @@ +# Add your grafana dashboards into this directory diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/all-nodes-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/all-nodes-dashboard.json new file mode 100644 index 00000000..fd36d882 --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/all-nodes-dashboard.json @@ -0,0 +1,848 @@ +{ + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "4.1.1" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + 
"expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + "refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": 
"percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 4, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 4, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + 
"tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + 
"title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + 
"datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": 
"transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "All Nodes", + "version": 1 +} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/deployment-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/deployment-dashboard.json new file mode 100644 index 00000000..8bc9f40c --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/deployment-dashboard.json @@ -0,0 +1,806 @@ +{ + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "3.1.1" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "editable": true, + "height": "200px", + "panels": [ + { + 
"cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + 
"value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + 
sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "metric": "kube_deployment_spec_replicas", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" 
+ }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + 
}, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": 
"max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "title": "New row" + }, + { + "collapse": false, + "editable": true, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "hideZero": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + 
"step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "showTitle": false, + "title": "New row" + } + ], + "schemaVersion": 12, + "sharedCrosshair": true, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + 
"tagValuesQuery": "", + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Deployment", + "version": 2 +} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/kubernetes-pods-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/kubernetes-pods-dashboard.json new file mode 100644 index 00000000..3428f36b --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/kubernetes-pods-dashboard.json @@ -0,0 +1,398 @@ +{ + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "3.1.1" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + 
"rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + }, + { + "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": 
false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + 
"intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + } + ], + "schemaVersion": 12, + "sharedCrosshair": true, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 26 
+} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/node-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/node-dashboard.json new file mode 100644 index 00000000..9a831aaa --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/node-dashboard.json @@ -0,0 +1,868 @@ +{ + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "4.1.1" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": true, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + 
"refId": "A", + "step": 50 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Idle cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 4 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 4 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": 
null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + 
], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 8, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 8 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, 
+ "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk space usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": 
false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted ", + "yaxis": 2 + } + ], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + 
"thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "prometheus" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 1 +} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/prometheus-datasource.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/prometheus-datasource.json new file mode 100644 index 00000000..47b8f1b2 --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/prometheus-datasource.json @@ -0,0 +1,7 @@ +{ + "access": "proxy", + "basicAuth": false, + "name": "prometheus", + 
"type": "prometheus", + "url": "http://prometheus-k8s.monitoring.svc:9090" +} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/resource-requests-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/resource-requests-dashboard.json new file mode 100644 index 00000000..e34315b8 --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/resource-requests-dashboard.json @@ -0,0 +1,424 @@ +{ + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "4.1.1" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to show the resource requests vs allocatable in the cluster", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + "refId": "A", + "step": 10 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Cores", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU Cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": 
"rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "CPU Cores", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "300", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A", + "step": 10 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B", + "step": 10 
+ } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + 
"repeatRowId": null, + "showTitle": false, + "title": "Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Resource Requests", + "version": 1 +} From e517658b5729132302c3883eaefce8c4a3ff6d2e Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Tue, 15 Aug 2017 17:34:16 +0200 Subject: [PATCH 115/638] dashboards removed from tool --- .../output/README.md | 3 +- .../all-nodes-dashboard.json | 848 ----------------- .../deployment-dashboard.json | 806 ---------------- .../kubernetes-pods-dashboard.json | 398 -------- .../grafana-dashboards/node-dashboard.json | 868 ------------------ .../prometheus-datasource.json | 7 - .../resource-requests-dashboard.json | 424 --------- 7 files changed, 2 insertions(+), 3352 deletions(-) delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/all-nodes-dashboard.json delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/deployment-dashboard.json delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/kubernetes-pods-dashboard.json delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/node-dashboard.json delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/prometheus-datasource.json delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/resource-requests-dashboard.json diff --git a/hack/grafana-dashboards-configmap-generator/output/README.md b/hack/grafana-dashboards-configmap-generator/output/README.md index 
68590878..47e06b89 100644 --- a/hack/grafana-dashboards-configmap-generator/output/README.md +++ b/hack/grafana-dashboards-configmap-generator/output/README.md @@ -1 +1,2 @@ -### This directory will include all generated manifests +### By default, this directory will include all generated manifests (if -o is not used) +### With -o, --output-file option we can force the tool to generate a file wherever we want diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/all-nodes-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/all-nodes-dashboard.json deleted file mode 100644 index fd36d882..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/all-nodes-dashboard.json +++ /dev/null @@ -1,848 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - }, - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - 
"lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", - "hide": false, - "intervalFactor": 10, - "legendFormat": "", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_load1)", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load5)", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load15)", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, 
- "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", - "intervalFactor": 2, - "legendFormat": "memory usage", - "metric": "memo", - "refId": "A", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_Buffers)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "memo", - "refId": "B", - "step": 4, - "target": "" - }, - { - "expr": "sum(node_memory_Cached)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "memo", - "refId": "C", - "step": 4, - "target": "" - }, - { - "expr": 
"sum(node_memory_MemFree)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "memo", - "refId": "D", - "step": 4, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "intervalFactor": 2, - "metric": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": 
"Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_disk_bytes_read[5m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 8, - "target": "" - }, - { - "expr": "sum(rate(node_disk_bytes_written[5m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 8 - }, - { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 8 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, 
- "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": 
true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - 
"tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "prometheus" - ], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "All Nodes", - "version": 1 -} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/deployment-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/deployment-dashboard.json deleted file mode 100644 index 8bc9f40c..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/deployment-dashboard.json +++ /dev/null @@ -1,806 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "3.1.1" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - 
"version": "1.0.0" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "editable": true, - "height": "200px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "CPU", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": 
false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "80%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "Bps", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 7, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - 
"full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Row" - }, - { - "collapse": false, - "editable": true, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 5, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "metric": "kube_deployment_spec_replicas", - "refId": 
"A", - "step": 600 - } - ], - "thresholds": "", - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Available Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 
0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - 
"span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "hideZero": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "current replicas", - "refId": "A", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B", - "step": 30 - }, - { - "expr": 
"max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "unavailable", - "refId": "C", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "D", - "step": 30 - }, - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "showTitle": false, - "title": "New row" - } - ], - "schemaVersion": 12, - "sharedCrosshair": true, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "deployment_namespace", - "options": [], - "query": "label_values(kube_deployment_metadata_generation, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Deployment", - "multi": false, - 
"name": "deployment_name", - "options": [], - "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "deployment", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Deployment", - "version": 2 -} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/kubernetes-pods-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/kubernetes-pods-dashboard.json deleted file mode 100644 index 3428f36b..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/kubernetes-pods-dashboard.json +++ /dev/null @@ -1,398 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "3.1.1" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - 
"threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Current: {{ container_name }}", - "metric": "container_memory_usage_bytes", - "refId": "A", - "step": 10 - }, - { - "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_resource_requests_memory_bytes", - "refId": "B", - "step": 20 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "Row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": 
"rgba(234, 112, 112, 0.22)" - }, - "id": 2, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", - "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A", - "step": 30 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - 
"seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "refId": "A", - "step": 30 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Network I/O", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - } - ], - "schemaVersion": 12, - "sharedCrosshair": true, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - "options": [], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 1, - "regex": "", - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - 
"5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Pods", - "version": 26 -} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/node-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/node-dashboard.json deleted file mode 100644 index 9a831aaa..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/node-dashboard.json +++ /dev/null @@ -1,868 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - }, - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": 
false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", - "hide": false, - "intervalFactor": 10, - "legendFormat": "{{cpu}}", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_load1{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "node_load5{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "node_load15{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": 
"cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory used", - "metric": "", - "refId": "C", - "step": 4 - }, - { - "expr": "node_memory_Buffers{instance=\"$server\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "", - "refId": "E", - "step": 4 - }, - { - "expr": "node_memory_Cached{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "", - "refId": "F", - 
"step": 4 - }, - { - "expr": "node_memory_MemFree{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "", - "refId": "D", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", - "intervalFactor": 2, - "refId": "A", 
- "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 8, - "target": "" - }, - { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 8 - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 8 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": 
true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": 
{}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "prometheus" - ], - "templating": { - "list": [ - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "server", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Nodes", - "version": 1 -} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/prometheus-datasource.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/prometheus-datasource.json deleted file mode 100644 index 47b8f1b2..00000000 --- 
a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/prometheus-datasource.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "access": "proxy", - "basicAuth": false, - "name": "prometheus", - "type": "prometheus", - "url": "http://prometheus-k8s.monitoring.svc:9090" -} diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/resource-requests-dashboard.json b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/resource-requests-dashboard.json deleted file mode 100644 index e34315b8..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/resource-requests-dashboard.json +++ /dev/null @@ -1,424 +0,0 @@ -{ - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "__requires": [ - { - "id": "grafana", - "name": "Grafana", - "type": "grafana", - "version": "4.1.1" - }, - { - "id": "graph", - "name": "Graph", - "type": "panel", - "version": "" - }, - { - "id": "prometheus", - "name": "Prometheus", - "type": "datasource", - "version": "1.0.0" - }, - { - "id": "singlestat", - "name": "Singlestat", - "type": "panel", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to show the resource requests vs allocatable in the cluster", - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) 
is also shown.", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable CPU Cores", - "refId": "A", - "step": 10 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested CPU Cores", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "CPU Cores", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "CPU Cores", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": 
"", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "CPU Cores", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "CPU Cores", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable Memory", - "refId": "A", 
- "step": 10 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested Memory", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": "Memory", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "Memory", - "type": 
"singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Memory", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Resource Requests", - "version": 1 -} From 40718bed62b5b2a89377020acd081aec47ba3474 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Wed, 13 Sep 2017 12:26:34 +0200 Subject: [PATCH 116/638] Fix ServiceMonitor documentation link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 82cb7a80..ac102bb8 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ hack/cluster-monitoring/teardown The example manifests in [/manifests/examples/example-app](/contrib/kube-prometheus/manifests/examples/example-app) deploy a fake service exposing Prometheus metrics. They additionally define a new Prometheus -server and a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/master/Documentation/service-monitor.md), +server and a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/master/Documentation/design.md#servicemonitor), which specifies how the example service should be monitored. The Prometheus Operator will deploy and configure the desired Prometheus instance and continiously manage its life cycle. 
From ec4fabad9ca66059f4c58310201bea596bef35b9 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Thu, 14 Sep 2017 12:46:00 +0200 Subject: [PATCH 117/638] set options added to main script --- .../bin/grafana_dashboards_generate.sh | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index 259e81b9..1173a998 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -1,4 +1,12 @@ #!/bin/bash + +# exit immediately when a command fails +set -e +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail +# error on unset variables +set -u + # Author: eedugon # Description: Tool to maintain grafana dashboards configmap for a grafana deployed @@ -124,7 +132,7 @@ addConfigMapHeader() { # If a parameter is provided it will be used as the configmap index. # If no parameter is provided, the name will be kept test "$#" -le 1 || { echo "# INTERNAL ERROR: Wrong call to function addConfigMapHeader"; return 1; } - local id="$1" + test "$#" -eq 1 && local id="$1" || local id="" if [ "$id" ]; then cat "$CONFIGMAP_HEADER" | sed "s/name: grafana-dashboards/name: grafana-dashboards-$id/" @@ -197,20 +205,20 @@ bin-pack-files() { test -f "$file" || { echo "# INTERNAL ERROR: File not found: $file"; continue; } # echo "debug: Processing file $(basename $file)" - file_size_bytes="$(stat -c%s "$file")" + file_size_bytes="$(stat -c%s "$file")" || true # If the file is bigger than the configured limit we skip it file if [ "$file_size_bytes" -gt "$DATA_SIZE_LIMIT" ]; then echo "ERROR: File $(basename $file) bigger than size limit: $DATA_SIZE_LIMIT ($file_size_bytes). 
Skipping" continue fi - (( total_files_processed++ )) + (( total_files_processed++ )) || true if test "$(expr "$bytes_to_process" + "$file_size_bytes")" -le "$DATA_SIZE_LIMIT"; then # We have room to include the file in the configmap # test "$to_process" && to_process="$to_process $file" || to_process="$file" to_process+=("$file") - (( bytes_to_process = bytes_to_process + file_size_bytes )) + (( bytes_to_process = bytes_to_process + file_size_bytes )) || true echo "# File $(basename $file) : added to queue" else # There's no room to add this file to the queue. so we process what we have and add the file to the queue @@ -222,8 +230,8 @@ bin-pack-files() { addConfigMapHeader $n >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } addArrayToConfigMap >> $OUTPUT_FILE || { echo "ERROR in call to addArrayToConfigMap function"; exit 1; } # Initialize variables with info about file not processed - (( total_configmaps_created++ )) - (( n++ )) + (( total_configmaps_created++ )) || true + (( n++ )) || true # to_process="$file" to_process=() to_process+=("$file") @@ -286,7 +294,7 @@ if [ "$to_process" ]; then addConfigMapHeader $n >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } fi addArrayToConfigMap >> $OUTPUT_FILE || { echo "ERROR in call to addArrayToConfigMap function"; exit 1; } - (( total_configmaps_created++ )) + (( total_configmaps_created++ )) || true to_process=() fi From 5a253f192bd7d842cb019f004bd4c165779ddb5c Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Thu, 14 Sep 2017 12:57:16 +0200 Subject: [PATCH 118/638] changed prometheus memory to default of 400Mi --- manifests/prometheus/prometheus-k8s.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 84bd02a8..421c024d 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -21,7 +21,7 @@ 
spec: # memory. Modify based on your target and time-series count for # production use. This value is mainly meant for demonstration/testing # purposes. - memory: 4Gi + memory: 400Mi alerting: alertmanagers: - namespace: monitoring From 41972d362ca1a7d6bf70d4ffb3aa10706ddcb3ad Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Thu, 14 Sep 2017 15:26:45 +0200 Subject: [PATCH 119/638] added grafana-deployment updates when configmaps are generated --- .../bin/grafana_dashboards_generate.sh | 88 +++++++++++++++++-- .../output/README.md | 3 +- .../grafana-deployment-original.yaml | 74 ++++++++++++++++ .../grafana-deployment-template.yaml | 71 +++++++++++++++ hack/scripts/generate-manifests.sh | 4 +- manifests/grafana/grafana-dashboards.yaml | 18 ++-- manifests/grafana/grafana-deployment.yaml | 10 +-- 7 files changed, 243 insertions(+), 25 deletions(-) create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-original.yaml create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index 1173a998..91bcd428 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -16,8 +16,12 @@ set -u # Based on a configurable size limit, the tool will create 1 or N configmaps # to allocate the .json resources (bin packing) +# Update: 20170914 +# The tool also generates a grafana deployment manifest (-g option) + # parameters # -o, --output-file +# -g, --grafana-manifest-file # -i, --input-dir # -s, --size-limit # -x, --apply-configmap : true or false (default = false) @@ -66,15 +70,18 @@ DATE_EXEC="$(date "+%Y-%m-%d-%H%M%S")" BIN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" TOOL_HOME="$(dirname 
$BIN_DIR)" SCRIPT_BASE=`basename $0 | sed "s/\.[Ss][Hh]//"` +CONFIGMAP_DASHBOARD_PREFIX="grafana-dashboards" TEMPLATES_DIR="$TOOL_HOME/templates" DASHBOARD_HEADER_FILE="$TEMPLATES_DIR/dashboard.header" DASHBOARD_FOOT_FILE="$TEMPLATES_DIR/dashboard.foot" CONFIGMAP_HEADER="$TEMPLATES_DIR/ConfigMap.header" +GRAFANA_DEPLOYMENT_TEMPLATE="$TEMPLATES_DIR/grafana-deployment-template.yaml" OUTPUT_BASE_DIR="$TOOL_HOME/output" # Some default values OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-dashboards-configMap-$DATE_EXEC.yaml" +GRAFANA_OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-deployment-$DATE_EXEC.yaml" DASHBOARDS_DIR="$TEMPLATES_DIR/grafana-dashboards" APPLY_CONFIGMAP="false" @@ -89,6 +96,10 @@ while (( "$#" )); do OUTPUT_FILE="$2" shift ;; + "-g" | "--grafana-output-file") + GRAFANA_OUTPUT_FILE="$2" + shift + ;; "-i" | "--input-dir") DASHBOARDS_DIR="$2" shift @@ -125,6 +136,26 @@ while (( "$#" )); do shift done +# +# Auxiliary Functions +# +indentMultiLineString() { + # Indent a given string (in one line including multiple \n) + test "$#" -eq 2 || { echo "INTERNAL ERROR: wrong call to function indentMultiLineString"; exit 1; } + local indent_number="$1" + local string="$2" + + test "$indent_number" -ge 0 || { echo "INTERNAL ERROR: wrong indent number parameter: $indent_number"; exit 1; } + + # prepare indentation text + local indent_string="" + for (( c=0; c<$indent_number; c++ )); do + indent_string="$indent_string " + done + + echo "$string" | sed -e "s#^#$indent_string#" -e "s#\\\n#\\\n$indent_string#g" +} + # # Main Functions # @@ -135,7 +166,7 @@ addConfigMapHeader() { test "$#" -eq 1 && local id="$1" || local id="" if [ "$id" ]; then - cat "$CONFIGMAP_HEADER" | sed "s/name: grafana-dashboards/name: grafana-dashboards-$id/" + cat "$CONFIGMAP_HEADER" | sed "s/name: $CONFIGMAP_DASHBOARD_PREFIX/name: $CONFIGMAP_DASHBOARD_PREFIX-$id/" else cat "$CONFIGMAP_HEADER" fi @@ -247,12 +278,23 @@ bin-pack-files() { IFS=$OLDIFS } +# prepareGrafanaDeploymentManifest() { +# local 
num_configmaps="$1" +# +# for (( i=0; i<$total_configmaps_created; i++ )); do +# echo "Creating deployment for $CONFIGMAP_DASHBOARD_PREFIX-$i" +# +# done +# } + + # Some variables checks... test ! -d "$TEMPLATES_DIR" && { echo "ERROR: missing templates directory $TEMPLATES_DIR"; exit 1; } test -f "$DASHBOARD_FOOT_FILE" || { echo "Template $DASHBOARD_FOOT_FILE not found"; exit 1; } test -f "$DASHBOARD_HEADER_FILE" || { echo "Template $DASHBOARD_HEADER_FILE not found"; exit 1; } test -f "$CONFIGMAP_HEADER" || { echo "Template $CONFIGMAP_HEADER not found"; exit 1; } +test -f "$GRAFANA_DEPLOYMENT_TEMPLATE" || { echo "Template $GRAFANA_DEPLOYMENT_TEMPLATE not found"; exit 1; } test ! -d "$OUTPUT_BASE_DIR" && { echo "ERROR: missing directory $OUTPUT_BASE_DIR"; exit 1; } @@ -260,7 +302,9 @@ test ! -d "$OUTPUT_BASE_DIR" && { echo "ERROR: missing directory $OUTPUT_BASE_DI test -d "$DASHBOARDS_DIR" || { echo "ERROR: Dashboards directory not found: $DASHBOARDS_DIR"; echoSyntax; exit 1; } test -f "$OUTPUT_FILE" && { echo "ERROR: Output file already exists: $OUTPUT_FILE"; exit 1; } +test -f "$GRAFANA_OUTPUT_FILE" && { echo "ERROR: Output file already exists: $GRAFANA_OUTPUT_FILE"; exit 1; } touch $OUTPUT_FILE || { echo "ERROR: Unable to create or modify $OUTPUT_FILE"; exit 1; } +touch $GRAFANA_OUTPUT_FILE || { echo "ERROR: Unable to create or modify $GRAFANA_OUTPUT_FILE"; exit 1; } # Main code start @@ -269,6 +313,8 @@ echo "# Configured size limit: $DATA_SIZE_LIMIT bytes" echo "# Grafna input dashboards and datasources will be read from: $DASHBOARDS_DIR" echo "# Grafana Dashboards ConfigMap will be created into file:" echo "$OUTPUT_FILE" +echo "# Grafana Deployment manifest will be created into file:" +echo "$GRAFANA_OUTPUT_FILE" echo # Loop variables initialization @@ -286,7 +332,7 @@ if [ "$to_process" ]; then echo echo "# Size limit not reached ($bytes_to_process). 
Adding all files into basic configmap" echo - addConfigMapHeader >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } + addConfigMapHeader $n >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } else echo echo "# Size limit not reached ($bytes_to_process). Adding remaining files into configmap with id $n" @@ -302,24 +348,50 @@ echo "# Process completed, configmap created: $(basename $OUTPUT_FILE)" echo "# Summary" echo "# Total files processed: $total_files_processed" echo "# Total amount of ConfigMaps inside the manifest: $total_configmaps_created" +echo +# Grafana deployment Processing (for every configmap) +#prepareGrafanaDeploymentManifest "$total_configmaps_created" +VOLUMES="" +VOLUME_MOUNTS="" +WATCH_DIR="" +for (( i=0; i<$total_configmaps_created; i++ )); do + configmap="$CONFIGMAP_DASHBOARD_PREFIX-$i" + echo "# Preparing grafana deployment to support configmap: $configmap" + + test "$VOLUME_MOUNTS" && VOLUME_MOUNTS="$VOLUME_MOUNTS\n- name: $configmap\n mountPath: /var/$configmap" || VOLUME_MOUNTS="- name: $configmap\n mountPath: /var/$configmap" + test "$VOLUMES" && VOLUMES="$VOLUMES\n- name: $configmap\n configMap:\n name: $configmap" || VOLUMES="- name: $configmap\n configMap:\n name: $configmap" + test "$WATCH_DIR" && WATCH_DIR="$WATCH_DIR\n- '--watch-dir=/var/$configmap'" || WATCH_DIR="- '--watch-dir=/var/$configmap'" + # echo "DEBUG:" + # echo "VOLUMES: $VOLUMES" + # echo "VOLUME_MOUNTS: $VOLUME_MOUNTS" + # echo "WATCH_DIR: $WATCH_DIR" + echo +done + +echo "# Processing grafana deployment template into $GRAFANA_OUTPUT_FILE" +sed -e "s#XXX_VOLUMES_XXX#$(indentMultiLineString 6 "$VOLUMES")#" \ + -e "s#XXX_VOLUME_MOUNTS_XXX#$(indentMultiLineString 8 "$VOLUME_MOUNTS")#" \ + -e "s#XXX_WATCH_DIR_XXX#$(indentMultiLineString 10 "$WATCH_DIR")#" \ + $GRAFANA_DEPLOYMENT_TEMPLATE > $GRAFANA_OUTPUT_FILE # If output file is empty we can delete it and exit test ! 
-s "$OUTPUT_FILE" && { echo "# Configmap empty, deleting file"; rm $OUTPUT_FILE; exit 0; } +test ! -s "$GRAFANA_OUTPUT_FILE" && { echo "# Configmap empty, deleting file"; rm $GRAFANA_OUTPUT_FILE; exit 0; } if [ "$APPLY_CONFIGMAP" = "true" ]; then test -x "$(which kubectl)" || { echo "ERROR: kubectl command not available. Apply configmap not possible"; exit 1; } - echo + echo "# Applying configuration with $APPLY_TYPE method on namespace $NAMESPACE" if kubectl -n $NAMESPACE $APPLY_TYPE -f "$OUTPUT_FILE"; then echo - echo "# ConfigMap updated. Wait until grafana-watcher applies the changes and reloads the dashboards." + echo "# ConfigMap updated. Updating grafana deployment" + kubectl -n $NAMESPACE $APPLY_TYPE -f "$GRAFANA_OUTPUT_FILE" || { echo "Error applying Grafana deployment. Check yaml file: $GRAFANA_OUTPUT_FILE"; exit 1; } else - echo - echo "ERROR APPLYING CONFIGURATION. Check yaml file" - echo "$OUTPUT_FILE" + echo "Error applying Configmap. Check yaml file: $OUTPUT_FILE" fi else echo echo "# To apply the new configMap to your k8s system do something like:" - echo "kubectl -n monitoring $APPLY_TYPE -f $(basename $OUTPUT_FILE)" + echo "kubectl -n monitoring $APPLY_TYPE -f $OUTPUT_FILE" + echo "kubectl -n monitoring $APPLY_TYPE -f $GRAFANA_OUTPUT_FILE" echo fi diff --git a/hack/grafana-dashboards-configmap-generator/output/README.md b/hack/grafana-dashboards-configmap-generator/output/README.md index 47e06b89..d2e3c8a6 100644 --- a/hack/grafana-dashboards-configmap-generator/output/README.md +++ b/hack/grafana-dashboards-configmap-generator/output/README.md @@ -1,2 +1 @@ -### By default, this directory will include all generated manifests (if -o is not used) -### With -o, --output-file option we can force the tool to generate a file wherever we want +### This directory will include all generated manifests if no specific options are given diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-original.yaml 
b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-original.yaml new file mode 100644 index 00000000..1666deae --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-original.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1beta1 +kind: Deployment +metadata: + name: grafana +spec: + replicas: 1 + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:4.4.1 + env: + - name: GF_AUTH_BASIC_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana-credentials + key: user + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-credentials + key: password + volumeMounts: + - name: grafana-storage + mountPath: /var/grafana-storage + ports: + - name: web + containerPort: 3000 + resources: + requests: + memory: 100Mi + cpu: 100m + limits: + memory: 200Mi + cpu: 200m + - name: grafana-watcher + image: quay.io/coreos/grafana-watcher:v0.0.8 + args: + - '--watch-dir=/var/grafana-dashboards' + - '--grafana-url=http://localhost:3000' + env: + - name: GRAFANA_USER + valueFrom: + secretKeyRef: + name: grafana-credentials + key: user + - name: GRAFANA_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-credentials + key: password + resources: + requests: + memory: "16Mi" + cpu: "50m" + limits: + memory: "32Mi" + cpu: "100m" + volumeMounts: + - name: grafana-dashboards + mountPath: /var/grafana-dashboards + volumes: + - name: grafana-storage + emptyDir: {} + - name: grafana-dashboards + configMap: + name: grafana-dashboards diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml new file mode 100644 index 00000000..844ecbc7 --- /dev/null +++ 
b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1beta1 +kind: Deployment +metadata: + name: grafana +spec: + replicas: 1 + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:4.4.1 + env: + - name: GF_AUTH_BASIC_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana-credentials + key: user + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-credentials + key: password + volumeMounts: + - name: grafana-storage + mountPath: /var/grafana-storage + ports: + - name: web + containerPort: 3000 + resources: + requests: + memory: 100Mi + cpu: 100m + limits: + memory: 200Mi + cpu: 200m + - name: grafana-watcher + image: quay.io/coreos/grafana-watcher:v0.0.8 + args: +XXX_WATCH_DIR_XXX + - '--grafana-url=http://localhost:3000' + env: + - name: GRAFANA_USER + valueFrom: + secretKeyRef: + name: grafana-credentials + key: user + - name: GRAFANA_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-credentials + key: password + resources: + requests: + memory: "16Mi" + cpu: "50m" + limits: + memory: "32Mi" + cpu: "100m" + volumeMounts: +XXX_VOLUME_MOUNTS_XXX + volumes: + - name: grafana-storage + emptyDir: {} +XXX_VOLUMES_XXX diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index 64287dae..bb0c0685 100755 --- a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -10,8 +10,10 @@ hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-r # Max Size per ConfigMap: 240000 # Input dir: assets/grafana # output file: manifests/grafana/grafana-dashboards.yaml +# grafana deployment output file: manifests/grafana/grafana-deployment.yaml test -f manifests/grafana/grafana-dashboards.yaml && rm -f manifests/grafana/grafana-dashboards.yaml 
-hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana -o manifests/grafana/grafana-dashboards.yaml +test -f manifests/grafana/grafana-deployment.yaml && rm -f manifests/grafana/grafana-deployment.yaml +hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana -o manifests/grafana/grafana-dashboards.yaml -g manifests/grafana/grafana-deployment.yaml # Generate Grafana Credentials Secret hack/scripts/generate-grafana-credentials-secret.sh admin admin > manifests/grafana/grafana-credentials.yaml diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 38dfca63..36ac0373 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboards + name: grafana-dashboards-0 data: deployment-dashboard.json: |+ { @@ -821,7 +821,7 @@ data: "timezone": "browser", "title": "Deployment", "version": 1 - }, + } , "inputs": [ { "name": "DS_PROMETHEUS", @@ -1882,7 +1882,7 @@ data: "timezone": "browser", "title": "Kubernetes Capacity Planing", "version": 4 - }, + } , "inputs": [ { "name": "DS_PROMETHEUS", @@ -2628,7 +2628,7 @@ data: "timezone": "", "title": "Kubernetes Cluster Health", "version": 12 - }, + } , "inputs": [ { "name": "DS_PROMETHEUS", @@ -3537,7 +3537,7 @@ data: "timezone": "", "title": "Kubernetes Cluster Status", "version": 22 - }, + } , "inputs": [ { "name": "DS_PROMETHEUS", @@ -4213,7 +4213,7 @@ data: "timezone": "", "title": "Kubernetes Control Plane Status", "version": 11 - }, + } , "inputs": [ { "name": "DS_PROMETHEUS", @@ -4660,7 +4660,7 @@ data: "timezone": "browser", "title": "Kubernetes Resource Requests", "version": 2 - }, + } , "inputs": [ { "name": "DS_PROMETHEUS", @@ -5565,7 +5565,7 @@ data: "timezone": "browser", "title": "Nodes", "version": 2 - }, + } , "inputs": [ { "name": "DS_PROMETHEUS", 
@@ -6010,7 +6010,7 @@ data: "timezone": "browser", "title": "Pods", "version": 1 - }, + } , "inputs": [ { "name": "DS_PROMETHEUS", diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 1666deae..a75724e5 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -43,7 +43,7 @@ spec: - name: grafana-watcher image: quay.io/coreos/grafana-watcher:v0.0.8 args: - - '--watch-dir=/var/grafana-dashboards' + - '--watch-dir=/var/grafana-dashboards-0' - '--grafana-url=http://localhost:3000' env: - name: GRAFANA_USER @@ -64,11 +64,11 @@ spec: memory: "32Mi" cpu: "100m" volumeMounts: - - name: grafana-dashboards - mountPath: /var/grafana-dashboards + - name: grafana-dashboards-0 + mountPath: /var/grafana-dashboards-0 volumes: - name: grafana-storage emptyDir: {} - - name: grafana-dashboards + - name: grafana-dashboards-0 configMap: - name: grafana-dashboards + name: grafana-dashboards-0 From 3c4dec7a1cec4254203b435ceb900b2113079beb Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Fri, 15 Sep 2017 11:43:13 +0200 Subject: [PATCH 120/638] grafana-deployment-original file deleted --- .../grafana-deployment-original.yaml | 74 ------------------- 1 file changed, 74 deletions(-) delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-original.yaml diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-original.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-original.yaml deleted file mode 100644 index 1666deae..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-original.yaml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: apps/v1beta1 -kind: Deployment -metadata: - name: grafana -spec: - replicas: 1 - template: - metadata: - labels: - app: grafana - spec: - containers: - - name: grafana - image: grafana/grafana:4.4.1 - env: - - name: 
GF_AUTH_BASIC_ENABLED - value: "true" - - name: GF_AUTH_ANONYMOUS_ENABLED - value: "true" - - name: GF_SECURITY_ADMIN_USER - valueFrom: - secretKeyRef: - name: grafana-credentials - key: user - - name: GF_SECURITY_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - name: grafana-credentials - key: password - volumeMounts: - - name: grafana-storage - mountPath: /var/grafana-storage - ports: - - name: web - containerPort: 3000 - resources: - requests: - memory: 100Mi - cpu: 100m - limits: - memory: 200Mi - cpu: 200m - - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.8 - args: - - '--watch-dir=/var/grafana-dashboards' - - '--grafana-url=http://localhost:3000' - env: - - name: GRAFANA_USER - valueFrom: - secretKeyRef: - name: grafana-credentials - key: user - - name: GRAFANA_PASSWORD - valueFrom: - secretKeyRef: - name: grafana-credentials - key: password - resources: - requests: - memory: "16Mi" - cpu: "50m" - limits: - memory: "32Mi" - cpu: "100m" - volumeMounts: - - name: grafana-dashboards - mountPath: /var/grafana-dashboards - volumes: - - name: grafana-storage - emptyDir: {} - - name: grafana-dashboards - configMap: - name: grafana-dashboards From c8cb2df928bc0c4975eaba8db7158080ef731671 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 18 Sep 2017 11:11:30 +0200 Subject: [PATCH 121/638] kube-prometheus: exclude pod log subresource from latency alerts --- assets/prometheus/rules/kube-apiserver.rules | 2 +- manifests/prometheus/prometheus-k8s-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules index a7fdfddc..04b4a6de 100644 --- a/assets/prometheus/rules/kube-apiserver.rules +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -16,7 +16,7 @@ ALERT K8SApiserverDown ALERT K8SApiServerLatency IF histogram_quantile( 0.99, - sum without (instance,resource) 
(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) + sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) ) / 1e6 > 1.0 FOR 10m LABELS { diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 71282c51..de3d7787 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -304,7 +304,7 @@ data: ALERT K8SApiServerLatency IF histogram_quantile( 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) + sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) ) / 1e6 > 1.0 FOR 10m LABELS { From 3cbffd9ed63d68f2320a29c2b11ddd2e5ac7db76 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 21 Sep 2017 14:23:45 +0200 Subject: [PATCH 122/638] *: bump version to v0.13.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 5f6d02ea..cc3bf5de 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.12.0 + image: quay.io/coreos/prometheus-operator:v0.13.0 name: prometheus-operator ports: - containerPort: 8080 From 621396431760e586207adc966191820e0849b888 Mon Sep 17 00:00:00 2001 From: Daniel Sachse Date: Thu, 21 Sep 2017 17:08:01 +0200 Subject: [PATCH 123/638] Updated to prometheus 1.7.1 This updates prometheus to 1.7.1 where it was missing --- docs/Monitoring external etcd.md | 2 +- 
manifests/examples/example-app/prometheus-frontend.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Monitoring external etcd.md b/docs/Monitoring external etcd.md index 65dfe139..6812e404 100644 --- a/docs/Monitoring external etcd.md +++ b/docs/Monitoring external etcd.md @@ -35,7 +35,7 @@ spec: replicas: 2 secrets: - etcd-certs - version: v1.7.0 + version: v1.7.1 ``` If your Prometheus Operator is already in place, update it: diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index f0341d51..b55b58db 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,7 +6,7 @@ metadata: labels: prometheus: frontend spec: - version: v1.7.0 + version: v1.7.1 serviceMonitorSelector: matchLabels: tier: frontend diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 421c024d..d6e5a52c 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.7.0 + version: v1.7.1 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: From 6aadd5cfd95466c85f14ad027f03ca711ac1cc34 Mon Sep 17 00:00:00 2001 From: Giap Tran Date: Fri, 22 Sep 2017 12:00:43 +0700 Subject: [PATCH 124/638] grafana_dashboards_generate: fix "file" local variable --- .../bin/grafana_dashboards_generate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index 91bcd428..93e91801 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ 
b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -174,7 +174,7 @@ addConfigMapHeader() { addArrayToConfigMap() { # This function process the array to_process into a configmap - + local file="" local OLDIFS=$IFS local IFS=$'\n' for file in ${to_process[@]}; do From 40fa4ccd158355d81c8e38773323aa59b221eb9e Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 26 Sep 2017 15:59:44 +0200 Subject: [PATCH 125/638] grafana-dashboards: various small improvements --- ...bernetes-capacity-planning-dashboard.json} | 2 +- .../kubernetes-cluster-health-dashboard.json | 16 ++++---- .../kubernetes-cluster-status-dashboard.json | 12 +++--- ...rnetes-control-plane-status-dashboard.json | 6 +-- manifests/grafana/grafana-dashboards.yaml | 38 +++++++++---------- 5 files changed, 37 insertions(+), 37 deletions(-) rename assets/grafana/{kubernetes-capacity-planing-dashboard.json => kubernetes-capacity-planning-dashboard.json} (99%) diff --git a/assets/grafana/kubernetes-capacity-planing-dashboard.json b/assets/grafana/kubernetes-capacity-planning-dashboard.json similarity index 99% rename from assets/grafana/kubernetes-capacity-planing-dashboard.json rename to assets/grafana/kubernetes-capacity-planning-dashboard.json index 81336453..3ea15947 100644 --- a/assets/grafana/kubernetes-capacity-planing-dashboard.json +++ b/assets/grafana/kubernetes-capacity-planning-dashboard.json @@ -1043,6 +1043,6 @@ ] }, "timezone": "browser", - "title": "Kubernetes Capacity Planing", + "title": "Kubernetes Capacity Planning", "version": 4 } \ No newline at end of file diff --git a/assets/grafana/kubernetes-cluster-health-dashboard.json b/assets/grafana/kubernetes-cluster-health-dashboard.json index d04ff0c0..46eb6ca7 100644 --- a/assets/grafana/kubernetes-cluster-health-dashboard.json +++ b/assets/grafana/kubernetes-cluster-health-dashboard.json @@ -182,7 +182,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ALERTS{alertstate=\"firing\"})", + "expr": 
"sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -197,7 +197,7 @@ "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], @@ -260,7 +260,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ALERTS{alertstate=\"pending\"})", + "expr": "sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -275,7 +275,7 @@ "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], @@ -428,7 +428,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(kube_node_status_ready{condition!=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -506,7 +506,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(kube_node_status_disk_pressure{condition=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -584,7 +584,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(kube_node_status_memory_pressure{condition=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -729,5 +729,5 @@ }, "timezone": "", "title": "Kubernetes Cluster Health", - "version": 12 + "version": 9 } \ No newline at end of file diff --git a/assets/grafana/kubernetes-cluster-status-dashboard.json b/assets/grafana/kubernetes-cluster-status-dashboard.json index 1b6d5b35..d30906bd 100644 --- a/assets/grafana/kubernetes-cluster-status-dashboard.json +++ b/assets/grafana/kubernetes-cluster-status-dashboard.json @@ -176,7 +176,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ALERTS{alertstate=\"firing\"})", + "expr": 
"sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -191,7 +191,7 @@ "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], @@ -266,7 +266,7 @@ "tableColumn": "", "targets": [ { - "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -345,7 +345,7 @@ "tableColumn": "", "targets": [ { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -424,7 +424,7 @@ "tableColumn": "", "targets": [ { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -892,5 +892,5 @@ }, "timezone": "", "title": "Kubernetes Cluster Status", - "version": 22 + "version": 3 } \ No newline at end of file diff --git a/assets/grafana/kubernetes-control-plane-status-dashboard.json b/assets/grafana/kubernetes-control-plane-status-dashboard.json index 03b547ac..93508313 100644 --- a/assets/grafana/kubernetes-control-plane-status-dashboard.json +++ b/assets/grafana/kubernetes-control-plane-status-dashboard.json @@ -333,7 +333,7 @@ "tableColumn": "", "targets": [ { - "expr": "topk(1, (sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100)", + "expr": "topk(1, (sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100)", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "", @@ -348,7 +348,7 @@ "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], @@ -659,5 +659,5 @@ }, "timezone": "", "title": "Kubernetes Control Plane Status", - "version": 11 + "version": 3 } \ No newline at end of file diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 36ac0373..6aec1802 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -832,7 +832,7 @@ data: ], "overwrite": true } - kubernetes-capacity-planing-dashboard.json: |+ + kubernetes-capacity-planning-dashboard.json: |+ { "dashboard": { @@ -1880,7 +1880,7 @@ data: ] }, "timezone": "browser", - "title": "Kubernetes Capacity Planing", + "title": "Kubernetes Capacity Planning", "version": 4 } , "inputs": [ @@ -2080,7 +2080,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(ALERTS{alertstate=\"firing\"})", + "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -2095,7 +2095,7 @@ data: "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], @@ -2158,7 +2158,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(ALERTS{alertstate=\"pending\"})", + "expr": "sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -2173,7 +2173,7 @@ data: "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], @@ -2326,7 +2326,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(kube_node_status_ready{condition!=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -2404,7 +2404,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(kube_node_status_disk_pressure{condition=\"true\"})", + "expr": 
"sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -2482,7 +2482,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(kube_node_status_memory_pressure{condition=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -2627,7 +2627,7 @@ data: }, "timezone": "", "title": "Kubernetes Cluster Health", - "version": 12 + "version": 9 } , "inputs": [ { @@ -2820,7 +2820,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(ALERTS{alertstate=\"firing\"})", + "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -2835,7 +2835,7 @@ data: "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], @@ -2910,7 +2910,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -2989,7 +2989,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -3068,7 +3068,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -3536,7 +3536,7 @@ data: }, "timezone": "", "title": "Kubernetes Cluster Status", - "version": 22 + "version": 3 } , 
"inputs": [ { @@ -3886,7 +3886,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "topk(1, (sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100)", + "expr": "topk(1, (sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -3901,7 +3901,7 @@ data: "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], @@ -4212,7 +4212,7 @@ data: }, "timezone": "", "title": "Kubernetes Control Plane Status", - "version": 11 + "version": 3 } , "inputs": [ { From f8c48a976aafe71fb7d5c811216907dc548ce476 Mon Sep 17 00:00:00 2001 From: Giancarlo Rubio Date: Fri, 29 Sep 2017 14:41:01 +0200 Subject: [PATCH 126/638] bump kube-state-metrics to v1.0.1 + fix rbac metric --- docs/Monitoring external etcd.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Monitoring external etcd.md b/docs/Monitoring external etcd.md index 6812e404..15e5632e 100644 --- a/docs/Monitoring external etcd.md +++ b/docs/Monitoring external etcd.md @@ -25,7 +25,7 @@ Edit prometheus-operator/contrib/kube-prometheus/manifests/prometheus/prometheus The manifest will look like that: ``` -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: k8s @@ -87,7 +87,7 @@ subsets: port: 2379 protocol: TCP --- -apiVersion: monitoring.coreos.com/v1alpha1 +apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: etcd-k8s From 710110603b0dc5a129eb1016e160107068f37ef3 Mon Sep 17 00:00:00 2001 From: Alexander Brandstedt Date: Fri, 6 Oct 2017 12:15:46 +0200 Subject: [PATCH 127/638] change make generate to use docker run to be more portable --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 04bd205a..79411408 100644 --- a/Makefile
+++ b/Makefile @@ -1,3 +1,4 @@ +BUILDER := docker run --rm -it --workdir /data -v ${PWD}:/data debian:8 ./hack/scripts/generate-manifests.sh generate: @echo ">> Compiling assets and generating Kubernetes manifests" - @hack/scripts/generate-manifests.sh + $(BUILDER) From 461c01b0638f063b2cd36f1202e0ed0019c62cf3 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 6 Oct 2017 14:02:20 +0200 Subject: [PATCH 128/638] kube-prometheus: various version updates --- manifests/alertmanager/alertmanager.yaml | 2 +- manifests/grafana/grafana-deployment.yaml | 2 +- manifests/node-exporter/node-exporter-daemonset.yaml | 6 +++--- manifests/prometheus/prometheus-k8s.yaml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index bba9ff2e..22259ef4 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -6,4 +6,4 @@ metadata: alertmanager: main spec: replicas: 3 - version: v0.7.1 + version: v0.9.1 diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index a75724e5..4907dceb 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: grafana - image: grafana/grafana:4.4.1 + image: grafana/grafana:4.5.2 env: - name: GF_AUTH_BASIC_ENABLED value: "true" diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml index b7696694..2446fee4 100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -12,10 +12,10 @@ spec: hostNetwork: true hostPID: true containers: - - image: quay.io/prometheus/node-exporter:v0.14.0 + - image: quay.io/prometheus/node-exporter:v0.15.0 args: - - "-collector.procfs=/host/proc" - - "-collector.sysfs=/host/sys" + - "--path.procfs=/host/proc" + - 
"--path.sysfs=/host/sys" name: node-exporter ports: - containerPort: 9100 diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index d6e5a52c..e936de46 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.7.1 + version: v1.7.2 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: From 1a303998837fb78c7f00e92192a650d7ed04ab43 Mon Sep 17 00:00:00 2001 From: jordanjennings Date: Wed, 11 Oct 2017 15:09:11 -0400 Subject: [PATCH 129/638] [kube-prometheus] Update to Grafana 4.5.2 in template In the generated version of this file at /contrib/kube-prometheus/manifests/grafana/grafana-deployment.yaml the version was bumped to 4.5.2 but seems it was overlooked in this templated version of the file. --- .../templates/grafana-deployment-template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml index 844ecbc7..b6b0f7ec 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: grafana - image: grafana/grafana:4.4.1 + image: grafana/grafana:4.5.2 env: - name: GF_AUTH_BASIC_ENABLED value: "true" From 7eed244db7ad9fb4386040e8aca6db2032a36f45 Mon Sep 17 00:00:00 2001 From: Valdis Rigdon Date: Fri, 13 Oct 2017 11:55:56 -0400 Subject: [PATCH 130/638] Fix merging of -datasource.json files The merge was missing a new line at the end of the merge, so only the first datasource was being put into the ConfigMap. 
--- .../bin/grafana_dashboards_generate.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index 93e91801..4a328959 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -199,6 +199,7 @@ addArrayToConfigMap() { # Dashboard foot test "$type" = "dashboard" && cat $DASHBOARD_FOOT_FILE + [ "$(tail -c 1 "$file")" ] && echo done echo "---" From 6ed84502c8ad8262a2dcb0088ba24f65bf22f091 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 16 Oct 2017 14:40:29 +0200 Subject: [PATCH 131/638] kube-prometheus: fix multiple series error in grafana dashboard --- assets/grafana/kubernetes-control-plane-status-dashboard.json | 2 +- manifests/grafana/grafana-dashboards.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/grafana/kubernetes-control-plane-status-dashboard.json b/assets/grafana/kubernetes-control-plane-status-dashboard.json index 93508313..47721922 100644 --- a/assets/grafana/kubernetes-control-plane-status-dashboard.json +++ b/assets/grafana/kubernetes-control-plane-status-dashboard.json @@ -333,7 +333,7 @@ "tableColumn": "", "targets": [ { - "expr": "topk(1, (sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100)", + "expr": "max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "", diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 6aec1802..13dc4e64 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -3886,7 +3886,7 @@ data: 
"tableColumn": "", "targets": [ { - "expr": "topk(1, (sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100)", + "expr": "max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "", From dea42fe5caa0f7d5fff305283b9a2bb1cc601b0e Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 16 Oct 2017 16:58:46 +0200 Subject: [PATCH 132/638] *: ensure using crdgroup flag in monitoring client --- manifests/node-exporter/node-exporter-daemonset.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml index 2446fee4..d98deee6 100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -12,7 +12,7 @@ spec: hostNetwork: true hostPID: true containers: - - image: quay.io/prometheus/node-exporter:v0.15.0 + - image: quay.io/prometheus/node-exporter:v0.15.0 args: - "--path.procfs=/host/proc" - "--path.sysfs=/host/sys" From 1b7c8cdf2199b4f6f070a280b6aa7248772dd291 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 16 Oct 2017 15:11:53 +0200 Subject: [PATCH 133/638] *: bump Prometheus to v2.0.0-rc.1 --- assets/prometheus/rules/alertmanager.rules | 36 - .../prometheus/rules/alertmanager.rules.yaml | 33 + assets/prometheus/rules/etcd3.rules | 177 --- assets/prometheus/rules/etcd3.rules.yaml | 123 ++ assets/prometheus/rules/general.rules | 63 - assets/prometheus/rules/general.rules.yaml | 48 + assets/prometheus/rules/kube-apiserver.rules | 28 - .../rules/kube-apiserver.rules.yaml | 22 + .../rules/kube-controller-manager.rules | 11 - .../rules/kube-controller-manager.rules.yaml | 13 + assets/prometheus/rules/kube-scheduler.rules | 11 - .../rules/kube-scheduler.rules.yaml | 13 
+ assets/prometheus/rules/kubelet.rules | 60 - assets/prometheus/rules/kubelet.rules.yaml | 49 + assets/prometheus/rules/kubernetes.rules | 171 --- assets/prometheus/rules/kubernetes.rules.yaml | 115 ++ assets/prometheus/rules/node.rules | 43 - assets/prometheus/rules/node.rules.yaml | 37 + assets/prometheus/rules/prometheus.rules | 10 - assets/prometheus/rules/prometheus.rules.yaml | 12 + hack/scripts/generate-rules-configmap.sh | 2 +- .../prometheus/prometheus-k8s-rules.yaml | 1095 +++++++---------- manifests/prometheus/prometheus-k8s.yaml | 2 +- 23 files changed, 942 insertions(+), 1232 deletions(-) delete mode 100644 assets/prometheus/rules/alertmanager.rules create mode 100644 assets/prometheus/rules/alertmanager.rules.yaml delete mode 100644 assets/prometheus/rules/etcd3.rules create mode 100644 assets/prometheus/rules/etcd3.rules.yaml delete mode 100644 assets/prometheus/rules/general.rules create mode 100644 assets/prometheus/rules/general.rules.yaml delete mode 100644 assets/prometheus/rules/kube-apiserver.rules create mode 100644 assets/prometheus/rules/kube-apiserver.rules.yaml delete mode 100644 assets/prometheus/rules/kube-controller-manager.rules create mode 100644 assets/prometheus/rules/kube-controller-manager.rules.yaml delete mode 100644 assets/prometheus/rules/kube-scheduler.rules create mode 100644 assets/prometheus/rules/kube-scheduler.rules.yaml delete mode 100644 assets/prometheus/rules/kubelet.rules create mode 100644 assets/prometheus/rules/kubelet.rules.yaml delete mode 100644 assets/prometheus/rules/kubernetes.rules create mode 100644 assets/prometheus/rules/kubernetes.rules.yaml delete mode 100644 assets/prometheus/rules/node.rules create mode 100644 assets/prometheus/rules/node.rules.yaml delete mode 100644 assets/prometheus/rules/prometheus.rules create mode 100644 assets/prometheus/rules/prometheus.rules.yaml diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules deleted file mode 100644 
index 71bdc687..00000000 --- a/assets/prometheus/rules/alertmanager.rules +++ /dev/null @@ -1,36 +0,0 @@ -ALERT AlertmanagerConfigInconsistent - IF count_values by (service) ("config_hash", alertmanager_config_hash) - / on(service) group_left - label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Alertmanager configurations are inconsistent", - description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." - } - -ALERT AlertmanagerDownOrMissing - IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") - / on(job) group_right - sum by(job) (up) != 1 - FOR 5m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager down or not discovered", - description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." - } - -ALERT FailedReload - IF alertmanager_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager configuration reload has failed", - description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
- } diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml new file mode 100644 index 00000000..8f65c5da --- /dev/null +++ b/assets/prometheus/rules/alertmanager.rules.yaml @@ -0,0 +1,33 @@ +groups: +- name: ./alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) + GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", + "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", + "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Alertmanager configuration reload has failed diff --git a/assets/prometheus/rules/etcd3.rules b/assets/prometheus/rules/etcd3.rules deleted file mode 100644 index 1b1621e4..00000000 --- a/assets/prometheus/rules/etcd3.rules +++ /dev/null @@ -1,177 +0,0 @@ -# general cluster availability - -# alert if another failed member will result in an unavailable cluster -ALERT InsufficientMembers -IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) -FOR 3m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "etcd cluster insufficient members", - description = "If one more etcd member goes down the cluster will be unavailable", -} - -# etcd leader alerts -# ================== - -# alert if any etcd instance has no leader -ALERT NoLeader -IF etcd_server_has_leader{job="etcd"} == 0 -FOR 1m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "etcd member has no leader", - description = "etcd member {{ $labels.instance }} has no leader", -} - -# alert if there are lots of leader changes -ALERT HighNumberOfLeaderChanges -IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of leader changes within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", -} - -# gRPC request alerts -# =================== - -# alert if more than 1% of gRPC method calls have failed within the last 5 minutes -ALERT HighNumberOfFailedGRPCRequests -IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if more than 5% of gRPC method 
calls have failed within the last 5 minutes -ALERT HighNumberOfFailedGRPCRequests -IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 -FOR 5m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if the 99th percentile of gRPC method calls take more than 150ms -ALERT GRPCRequestsSlow -IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "slow gRPC requests", - description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", -} - -# HTTP requests alerts -# ==================== - -# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes -ALERT HighNumberOfFailedHTTPRequests -IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes -ALERT HighNumberOfFailedHTTPRequests -IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 -FOR 5m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if the 99th percentile of 
HTTP requests take more than 150ms -ALERT HTTPRequestsSlow -IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", -} - -# etcd member communication alerts -# ================================ - -# alert if 99th percentile of round trips take 150ms -ALERT EtcdMemberCommunicationSlow -IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "etcd member communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", -} - -# etcd proposal alerts -# ==================== - -# alert if there are several failed proposals within an hour -ALERT HighNumberOfFailedProposals -IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of proposals within the etcd cluster are failing", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", -} - -# etcd disk io latency alerts -# =========================== - -# alert if 99th percentile of fsync durations is higher than 500ms -ALERT HighFsyncDurations -IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "high fsync durations", - description = "etcd instance {{ $labels.instance }} fync durations are high", -} - -# alert if 99th percentile of commit durations is higher than 250ms -ALERT HighCommitDurations -IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = 
"high commit durations", - description = "etcd instance {{ $labels.instance }} commit durations are high", -} diff --git a/assets/prometheus/rules/etcd3.rules.yaml b/assets/prometheus/rules/etcd3.rules.yaml new file mode 100644 index 00000000..ade2ed62 --- /dev/null +++ b/assets/prometheus/rules/etcd3.rules.yaml @@ -0,0 +1,123 @@ +groups: +- name: ./etcd3.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on 
etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: 
increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules deleted file mode 100644 index 3500d689..00000000 --- a/assets/prometheus/rules/general.rules +++ /dev/null @@ -1,63 +0,0 @@ -### Up Alerting ### - -Alert TargetDown - IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Targets are down", - description = "{{ $value }}% or more of {{ $labels.job }} targets are down." 
- } - -### Dead man's switch ### - -ALERT DeadMansSwitch - IF vector(1) - LABELS { - severity = "none", - } - ANNOTATIONS { - summary = "Alerting DeadMansSwitch", - description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", - } - -### File descriptor alerts ### - -ALERT TooManyOpenFileDescriptors - IF 100 * (process_open_fds / process_max_fds) > 95 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", - } - -instance:fd_utilization = process_open_fds / process_max_fds - -# alert if file descriptors are likely to exhaust within the next 4 hours -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - -# alert if file descriptors are likely to exhaust within the next hour -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml new file mode 100644 index 00000000..355e12f3 --- /dev/null +++ b/assets/prometheus/rules/general.rules.yaml @@ -0,0 +1,48 @@ +groups: +- name: ./general.rules + rules: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: 
+ severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + summary: Targets are down + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire Alerting + pipeline is functional. + summary: Alerting DeadMansSwitch + - alert: TooManyOpenFileDescriptors + expr: 100 * (process_open_fds / process_max_fds) > 95 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' + summary: too many open file descriptors + - record: instance:fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules deleted file mode 100644 index 04b4a6de..00000000 --- a/assets/prometheus/rules/kube-apiserver.rules +++ /dev/null @@ -1,28 +0,0 @@ -ALERT K8SApiserverDown - IF absent(up{job="apiserver"} == 1) - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape API server(s), or all API servers have 
disappeared from service discovery.", - } - -# Some verbs excluded because they are expected to be long-lasting: -# WATCHLIST is long-poll, CONNECT is `kubectl exec`. -# -# apiserver_request_latencies' unit is microseconds -ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } diff --git a/assets/prometheus/rules/kube-apiserver.rules.yaml b/assets/prometheus/rules/kube-apiserver.rules.yaml new file mode 100644 index 00000000..55ebe025 --- /dev/null +++ b/assets/prometheus/rules/kube-apiserver.rules.yaml @@ -0,0 +1,22 @@ +groups: +- name: ./kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have + disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the + kube-apiserver is higher than 1s. 
+ summary: Kubernetes apiserver latency is high diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules deleted file mode 100644 index 3157cd12..00000000 --- a/assets/prometheus/rules/kube-controller-manager.rules +++ /dev/null @@ -1,11 +0,0 @@ -ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", - } diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml new file mode 100644 index 00000000..f23bbde3 --- /dev/null +++ b/assets/prometheus/rules/kube-controller-manager.rules.yaml @@ -0,0 +1,13 @@ +groups: +- name: ./kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication + controllers are not making progress. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules deleted file mode 100644 index ee86017a..00000000 --- a/assets/prometheus/rules/kube-scheduler.rules +++ /dev/null @@ -1,11 +0,0 @@ -ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. 
New pods are not being assigned to nodes.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler", - } diff --git a/assets/prometheus/rules/kube-scheduler.rules.yaml b/assets/prometheus/rules/kube-scheduler.rules.yaml new file mode 100644 index 00000000..0383b3b1 --- /dev/null +++ b/assets/prometheus/rules/kube-scheduler.rules.yaml @@ -0,0 +1,13 @@ +groups: +- name: ./kube-scheduler.rules + rules: + - alert: K8SSchedulerDown + expr: absent(up{job="kube-scheduler"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S scheduler. New pods are not being assigned + to nodes. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler + summary: Scheduler is down diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules deleted file mode 100644 index 0d47d9d7..00000000 --- a/assets/prometheus/rules/kubelet.rules +++ /dev/null @@ -1,60 +0,0 @@ -ALERT K8SNodeNotReady - IF kube_node_status_condition{condition="Ready", status="true"} == 0 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - -ALERT K8SManyNodesNotReady - IF - count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1 - AND - ( - count(kube_node_status_condition{condition="Ready", status="true"} == 0) - / - count(kube_node_status_condition{condition="Ready", status="true"}) - ) > 0.2 - FOR 1m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubernetes nodes are Not Ready", - description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", - } - -ALERT K8SKubeletDown - IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 - FOR 1h - LABELS { - 
severity = "warning", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets.", - } - -ALERT K8SKubeletDown - IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", - } - -ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml new file mode 100644 index 00000000..1aa5f84c --- /dev/null +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -0,0 +1,49 @@ +groups: +- name: ./kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, + or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) + > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == + 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady + state).' 
+ summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) + > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets + have disappeared from service discovery. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close + to the limit of 110 + summary: Kubelet is close to pod limit diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules deleted file mode 100644 index 084d11e5..00000000 --- a/assets/prometheus/rules/kubernetes.rules +++ /dev/null @@ -1,171 +0,0 @@ -# NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
- -### Container resources ### - -cluster_namespace_controller_pod_container:spec_memory_limit_bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:spec_cpu_shares = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_cpu_shares{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:cpu_usage:rate = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - irate( - container_cpu_usage_seconds_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_working_set:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_rss:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_rss{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_cache:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_cache{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:disk_usage:bytes = - sum by 
(cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_disk_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_pagefaults:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failures_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_oom:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failcnt{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -### Cluster resources ### - -cluster:memory_allocation:percent = - 100 * sum by (cluster) ( - container_spec_memory_limit_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - -cluster:memory_used:percent = - 100 * sum by (cluster) ( - container_memory_usage_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - -cluster:cpu_allocation:percent = - 100 * sum by (cluster) ( - container_spec_cpu_shares{pod_name!=""} - ) / sum by (cluster) ( - container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores - ) - -cluster:node_cpu_use:percent = - 100 * sum by (cluster) ( - rate(node_cpu{mode!="idle"}[5m]) - ) / sum by (cluster) ( - machine_cpu_cores - ) - -### API latency ### - -# Raw metrics are in microseconds. Convert to seconds. 
-cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile( - 0.99, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 -cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile( - 0.9, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 -cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile( - 0.5, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - -### Scheduling latency ### - -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = - 
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml new file mode 100644 index 00000000..ab5ccf06 --- /dev/null +++ b/assets/prometheus/rules/kubernetes.rules.yaml @@ -0,0 +1,115 @@ +groups: +- name: ./kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: 
sum(label_replace(container_memory_rss{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) + / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} + * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) + BY (cluster) + - record: 
cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: 
cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules deleted file mode 100644 index 4f768671..00000000 --- a/assets/prometheus/rules/node.rules +++ /dev/null @@ -1,43 +0,0 @@ -ALERT NodeExporterDown - IF absent(up{job="node-exporter"} == 1) - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "node-exporter cannot be scraped", - description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", - } - -ALERT K8SNodeOutOfDisk - IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Node ran out of disk space.", - description = "{{ $labels.node }} has run out of disk space.", - } - -ALERT K8SNodeMemoryPressure - IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under memory pressure.", - description = "{{ $labels.node }} is under memory pressure.", - } - -ALERT K8SNodeDiskPressure - IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under disk pressure.", - description = "{{ $labels.node }} is under disk pressure.", - } diff 
--git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml new file mode 100644 index 00000000..9c1641ca --- /dev/null +++ b/assets/prometheus/rules/node.rules.yaml @@ -0,0 +1,37 @@ +groups: +- name: ./node.rules + rules: + - alert: NodeExporterDown + expr: absent(up{job="node-exporter"} == 1) + for: 10m + labels: + severity: warning + annotations: + description: Prometheus could not scrape a node-exporter for more than 10m, + or node-exporters have disappeared from discovery. + summary: node-exporter cannot be scraped + - alert: K8SNodeOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + labels: + service: k8s + severity: critical + annotations: + description: '{{ $labels.node }} has run out of disk space.' + summary: Node ran out of disk space. + - alert: K8SNodeMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == + 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under memory pressure.' + summary: Node is under memory pressure. + - alert: K8SNodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under disk pressure.' + summary: Node is under disk pressure. diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules deleted file mode 100644 index 05c278f1..00000000 --- a/assets/prometheus/rules/prometheus.rules +++ /dev/null @@ -1,10 +0,0 @@ -ALERT FailedReload - IF prometheus_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Prometheus configuration reload has failed", - description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
- } diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml new file mode 100644 index 00000000..6ed0cd68 --- /dev/null +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -0,0 +1,12 @@ +groups: +- name: ./prometheus.rules + rules: + - alert: FailedReload + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Prometheus' configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + summary: Prometheus configuration reload has failed diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh index b8e00fef..9eb2efc8 100755 --- a/hack/scripts/generate-rules-configmap.sh +++ b/hack/scripts/generate-rules-configmap.sh @@ -11,7 +11,7 @@ metadata: data: EOF -for f in assets/prometheus/rules/*.rules +for f in assets/prometheus/rules/*.rules.yaml do echo " $(basename $f): |+" cat $f | sed "s/^/ /g" diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index de3d7787..041c127b 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -6,623 +6,478 @@ metadata: role: prometheus-rulefiles prometheus: k8s data: - alertmanager.rules: |+ - ALERT AlertmanagerConfigInconsistent - IF count_values by (service) ("config_hash", alertmanager_config_hash) - / on(service) group_left - label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Alertmanager configurations are inconsistent", - description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." 
- } - - ALERT AlertmanagerDownOrMissing - IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") - / on(job) group_right - sum by(job) (up) != 1 - FOR 5m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager down or not discovered", - description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." - } - - ALERT FailedReload - IF alertmanager_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager configuration reload has failed", - description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." - } - etcd3.rules: |+ - # general cluster availability - - # alert if another failed member will result in an unavailable cluster - ALERT InsufficientMembers - IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "etcd cluster insufficient members", - description = "If one more etcd member goes down the cluster will be unavailable", - } - - # etcd leader alerts - # ================== - - # alert if any etcd instance has no leader - ALERT NoLeader - IF etcd_server_has_leader{job="etcd"} == 0 - FOR 1m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "etcd member has no leader", - description = "etcd member {{ $labels.instance }} has no leader", - } - - # alert if there are lots of leader changes - ALERT HighNumberOfLeaderChanges - IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of leader changes within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", - } - - # gRPC request alerts - # =================== - - # alert if more than 1% 
of gRPC method calls have failed within the last 5 minutes - ALERT HighNumberOfFailedGRPCRequests - IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of gRPC method calls have failed within the last 5 minutes - ALERT HighNumberOfFailedGRPCRequests - IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if the 99th percentile of gRPC method calls take more than 150ms - ALERT GRPCRequestsSlow - IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "slow gRPC requests", - description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", - } - - # HTTP requests alerts - # ==================== - - # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ 
$labels.instance }}", - } - - # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if the 99th percentile of HTTP requests take more than 150ms - ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", - } - - # etcd member communication alerts - # ================================ - - # alert if 99th percentile of round trips take 150ms - ALERT EtcdMemberCommunicationSlow - IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "etcd member communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", - } - - # etcd proposal alerts - # ==================== - - # alert if there are several failed proposals within an hour - ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of proposals within the etcd cluster are failing", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", - } - - # etcd disk io latency alerts - # =========================== - - # alert if 
99th percentile of fsync durations is higher than 500ms - ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high fsync durations", - description = "etcd instance {{ $labels.instance }} fync durations are high", - } - - # alert if 99th percentile of commit durations is higher than 250ms - ALERT HighCommitDurations - IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high commit durations", - description = "etcd instance {{ $labels.instance }} commit durations are high", - } - general.rules: |+ - ### Up Alerting ### - - Alert TargetDown - IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Targets are down", - description = "{{ $value }}% or more of {{ $labels.job }} targets are down." 
- } - - ### Dead man's switch ### - - ALERT DeadMansSwitch - IF vector(1) - LABELS { - severity = "none", - } - ANNOTATIONS { - summary = "Alerting DeadMansSwitch", - description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", - } - - ### File descriptor alerts ### - - ALERT TooManyOpenFileDescriptors - IF 100 * (process_open_fds / process_max_fds) > 95 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", - } - - instance:fd_utilization = process_open_fds / process_max_fds - - # alert if file descriptors are likely to exhaust within the next 4 hours - ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - - # alert if file descriptors are likely to exhaust within the next hour - ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - kube-apiserver.rules: |+ - ALERT K8SApiserverDown - IF absent(up{job="apiserver"} == 1) - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", - } - - # Some verbs excluded because they are expected 
to be long-lasting: - # WATCHLIST is long-poll, CONNECT is `kubectl exec`. - # - # apiserver_request_latencies' unit is microseconds - ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - kube-controller-manager.rules: |+ - ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", - } - kubelet.rules: |+ - ALERT K8SNodeNotReady - IF kube_node_status_condition{condition="Ready", status="true"} == 0 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - - ALERT K8SManyNodesNotReady - IF - count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1 - AND - ( - count(kube_node_status_condition{condition="Ready", status="true"} == 0) - / - count(kube_node_status_condition{condition="Ready", status="true"}) - ) > 0.2 - FOR 1m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubernetes nodes are Not Ready", - description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", - } - - ALERT K8SKubeletDown - IF count(up{job="kubelet"} == 0) / 
count(up{job="kubelet"}) > 0.03 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets.", - } - - ALERT K8SKubeletDown - IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", - } - - ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - kubernetes.rules: |+ - # NOTE: These rules were kindly contributed by the SoundCloud engineering team. - - ### Container resources ### - - cluster_namespace_controller_pod_container:spec_memory_limit_bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:spec_cpu_shares = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_cpu_shares{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:cpu_usage:rate = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - irate( - container_cpu_usage_seconds_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - 
container_memory_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_working_set:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_rss:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_rss{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_cache:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_cache{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:disk_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_disk_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_pagefaults:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failures_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_oom:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failcnt{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - ### Cluster resources ### - - cluster:memory_allocation:percent = - 100 * sum by (cluster) ( - container_spec_memory_limit_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - - cluster:memory_used:percent = - 100 * sum by 
(cluster) ( - container_memory_usage_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - - cluster:cpu_allocation:percent = - 100 * sum by (cluster) ( - container_spec_cpu_shares{pod_name!=""} - ) / sum by (cluster) ( - container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores - ) - - cluster:node_cpu_use:percent = - 100 * sum by (cluster) ( - rate(node_cpu{mode!="idle"}[5m]) - ) / sum by (cluster) ( - machine_cpu_cores - ) - - ### API latency ### - - # Raw metrics are in microseconds. Convert to seconds. - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile( - 0.99, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile( - 0.9, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile( - 0.5, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - - ### Scheduling latency ### - - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - 
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - kube-scheduler.rules: |+ - ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. 
New pods are not being assigned to nodes.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler", - } - node.rules: |+ - ALERT NodeExporterDown - IF absent(up{job="node-exporter"} == 1) - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "node-exporter cannot be scraped", - description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", - } - - ALERT K8SNodeOutOfDisk - IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Node ran out of disk space.", - description = "{{ $labels.node }} has run out of disk space.", - } - - ALERT K8SNodeMemoryPressure - IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under memory pressure.", - description = "{{ $labels.node }} is under memory pressure.", - } - - ALERT K8SNodeDiskPressure - IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under disk pressure.", - description = "{{ $labels.node }} is under disk pressure.", - } - prometheus.rules: |+ - ALERT FailedReload - IF prometheus_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Prometheus configuration reload has failed", - description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
- } + alertmanager.rules.yaml: |+ + groups: + - name: ./alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) + GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", + "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", + "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Alertmanager configuration reload has failed + etcd3.rules.yaml: |+ + groups: + - name: ./etcd3.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + 
severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster 
are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fsync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations + general.rules.yaml: |+ + groups: + - name: ./general.rules + rules: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + summary: Targets are down + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire Alerting + pipeline is functional. + summary: Alerting DeadMansSwitch + - alert: TooManyOpenFileDescriptors + expr: 100 * (process_open_fds / process_max_fds) > 95 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' 
+ summary: too many open file descriptors + - record: instance:fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust its file/socket descriptors soon' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust its file/socket descriptors soon' + summary: file descriptors soon exhausted + kube-apiserver.rules.yaml: |+ + groups: + - name: ./kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have + disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the + kube-apiserver is higher than 1s. + summary: Kubernetes apiserver latency is high + kube-controller-manager.rules.yaml: |+ + groups: + - name: ./kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication + controllers are not making progress. 
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down + kube-scheduler.rules.yaml: |+ + groups: + - name: ./kube-scheduler.rules + rules: + - alert: K8SSchedulerDown + expr: absent(up{job="kube-scheduler"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S scheduler. New pods are not being assigned + to nodes. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler + summary: Scheduler is down + kubelet.rules.yaml: |+ + groups: + - name: ./kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, + or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) + > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == + 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady + state.' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. 
+ summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) + > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets + have disappeared from service discovery. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close + to the limit of 110 + summary: Kubelet is close to pod limit + kubernetes.rules.yaml: |+ + groups: + - name: ./kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, + 
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) + / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} + * ON(cluster, instance) 
machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) + BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: 
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + node.rules.yaml: |+ + groups: + - name: ./node.rules + rules: + - alert: NodeExporterDown + expr: absent(up{job="node-exporter"} == 1) + for: 10m + labels: + severity: warning + annotations: + description: Prometheus could not scrape a node-exporter for more than 10m, + or node-exporters have disappeared from discovery. + summary: node-exporter cannot be scraped + - alert: K8SNodeOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + labels: + service: k8s + severity: critical + annotations: + description: '{{ $labels.node }} has run out of disk space.' + summary: Node ran out of disk space. + - alert: K8SNodeMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == + 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under memory pressure.' + summary: Node is under memory pressure. + - alert: K8SNodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under disk pressure.' 
+ summary: Node is under disk pressure. + prometheus.rules.yaml: |+ + groups: + - name: ./prometheus.rules + rules: + - alert: FailedReload + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Prometheus' configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + summary: Prometheus configuration reload has failed diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index e936de46..168daa34 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.7.2 + version: v2.0.0-rc.1 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: From ea91202eb3eab8875af3d6cbed6d44c839bb4a99 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 19 Oct 2017 11:28:34 +0200 Subject: [PATCH 134/638] *: bump version to v0.14.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index cc3bf5de..dc2a74cb 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.13.0 + image: quay.io/coreos/prometheus-operator:v0.14.0 name: prometheus-operator ports: - containerPort: 8080 From c510324253fc0ecbb54861354466b9beae5404c6 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Date: Sun, 22 Oct 2017 11:20:57 +0200 Subject: [PATCH 135/638] newline bugfix modified in grafana configmap generator --- .../bin/grafana_dashboards_generate.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) 
diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index 4a328959..0a37c491 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -197,9 +197,11 @@ addArrayToConfigMap() { # File content: Indent 4 cat $file | sed "s/^/ /" + # If source file was not ended properly we add newline character + [ "$(tail -c 1 "$file")" ] && echo + # Dashboard foot test "$type" = "dashboard" && cat $DASHBOARD_FOOT_FILE - [ "$(tail -c 1 "$file")" ] && echo done echo "---" From 8bf449907c3f748d6688b2d4f3cab3e3e2b1f3c4 Mon Sep 17 00:00:00 2001 From: xuchenhao001 Date: Wed, 25 Oct 2017 23:41:42 +0800 Subject: [PATCH 136/638] add manifests for kubeadm setup tool --- .../k8s/kubeadm/kube-controller-manager.yaml | 17 +++++++++++++++++ manifests/k8s/kubeadm/kube-scheduler.yaml | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 manifests/k8s/kubeadm/kube-controller-manager.yaml create mode 100644 manifests/k8s/kubeadm/kube-scheduler.yaml diff --git a/manifests/k8s/kubeadm/kube-controller-manager.yaml b/manifests/k8s/kubeadm/kube-controller-manager.yaml new file mode 100644 index 00000000..bd8d7cb5 --- /dev/null +++ b/manifests/k8s/kubeadm/kube-controller-manager.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + namespace: kube-system + name: kube-controller-manager-prometheus-discovery + labels: + k8s-app: kube-controller-manager +spec: + selector: + component: kube-controller-manager + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10252 + targetPort: 10252 + protocol: TCP diff --git a/manifests/k8s/kubeadm/kube-scheduler.yaml b/manifests/k8s/kubeadm/kube-scheduler.yaml new file mode 100644 index 00000000..2d90097a --- /dev/null +++ b/manifests/k8s/kubeadm/kube-scheduler.yaml @@ -0,0 
+1,17 @@ +apiVersion: v1 +kind: Service +metadata: + namespace: kube-system + name: kube-scheduler-prometheus-discovery + labels: + k8s-app: kube-scheduler +spec: + selector: + component: kube-scheduler + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10251 + targetPort: 10251 + protocol: TCP From 73e3dffa72d87d37d0b07b61d992fa061f5ac854 Mon Sep 17 00:00:00 2001 From: Svend Sorensen Date: Mon, 30 Oct 2017 13:16:22 -0700 Subject: [PATCH 137/638] Omit static compute resource requests and limits from deployment Do not specify static resource settings in the kubernetes-deployment to avoid unnecessary replacement of pods when the manifest is reapplied (`kubectl apply`). The addon-resizer will dynamically set the pod compute resource values. If the values are also set statically in the deployment configuration, reapplying the configuration will result in the pods getting replaced. Without the static resource, the deployment configuration can be reapplied, and the pods will not be replaced. This change was also made in the upstream kube-state-metrics example manifests. 
https://github.com/kubernetes/kube-state-metrics/pull/285 --- .../kube-state-metrics/kube-state-metrics-deployment.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index bd313f21..ee8526d3 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -22,13 +22,6 @@ spec: port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 - resources: - requests: - memory: 100Mi - cpu: 100m - limits: - memory: 200Mi - cpu: 200m - name: addon-resizer image: gcr.io/google_containers/addon-resizer:1.0 resources: From d04cccc52651d9c3ffda3437a7c89f9af4dbd6ac Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Fri, 15 Sep 2017 08:05:12 +0200 Subject: [PATCH 138/638] Use grafanalib to generate Grafana dashboards --- Dockerfile | 6 + Makefile | 11 +- assets/grafana/.gitignore | 1 + assets/grafana/_grafanalib.py | 92 ++ assets/grafana/deployment-dashboard.json | 816 ---------- assets/grafana/deployment.dashboard.py | 467 ++++++ ...ubernetes-capacity-planning-dashboard.json | 1048 ------------- .../kubernetes-capacity-planning.dashboard.py | 454 ++++++ .../kubernetes-cluster-health-dashboard.json | 733 --------- .../kubernetes-cluster-health.dashboard.py | 396 +++++ .../kubernetes-cluster-status-dashboard.json | 896 ----------- .../kubernetes-cluster-status.dashboard.py | 450 ++++++ ...rnetes-control-plane-status-dashboard.json | 663 -------- ...bernetes-control-plane-status.dashboard.py | 336 ++++ ...ubernetes-resource-requests-dashboard.json | 434 ------ .../kubernetes-resource-requests.dashboard.py | 200 +++ assets/grafana/nodes-dashboard.json | 892 ----------- assets/grafana/nodes.dashboard.py | 414 +++++ assets/grafana/pods-dashboard.json | 432 ------ assets/grafana/pods.dashboard.py | 215 +++ .../bin/grafana_dashboards_generate.sh | 4 +- 
hack/scripts/generate-dashboards-configmap.sh | 12 +- hack/scripts/generate-manifests.sh | 3 +- manifests/grafana/grafana-dashboards.yaml | 1359 ++++++----------- requirements.txt | 1 + 25 files changed, 3504 insertions(+), 6831 deletions(-) create mode 100644 Dockerfile create mode 100644 assets/grafana/.gitignore create mode 100644 assets/grafana/_grafanalib.py delete mode 100644 assets/grafana/deployment-dashboard.json create mode 100644 assets/grafana/deployment.dashboard.py delete mode 100644 assets/grafana/kubernetes-capacity-planning-dashboard.json create mode 100644 assets/grafana/kubernetes-capacity-planning.dashboard.py delete mode 100644 assets/grafana/kubernetes-cluster-health-dashboard.json create mode 100644 assets/grafana/kubernetes-cluster-health.dashboard.py delete mode 100644 assets/grafana/kubernetes-cluster-status-dashboard.json create mode 100644 assets/grafana/kubernetes-cluster-status.dashboard.py delete mode 100644 assets/grafana/kubernetes-control-plane-status-dashboard.json create mode 100644 assets/grafana/kubernetes-control-plane-status.dashboard.py delete mode 100644 assets/grafana/kubernetes-resource-requests-dashboard.json create mode 100644 assets/grafana/kubernetes-resource-requests.dashboard.py delete mode 100644 assets/grafana/nodes-dashboard.json create mode 100644 assets/grafana/nodes.dashboard.py delete mode 100644 assets/grafana/pods-dashboard.json create mode 100644 assets/grafana/pods.dashboard.py create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..78032788 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.6-slim + +RUN apt-get update -y && apt-get install -y git +RUN pip3 install virtualenv + +ENTRYPOINT ["/bin/bash"] \ No newline at end of file diff --git a/Makefile b/Makefile index 79411408..a093ce7a 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,11 @@ -BUILDER := docker run --rm -it --workdir /data -v ${PWD}:/data debian:8 
./hack/scripts/generate-manifests.sh -generate: +.PHONY: image + +IMAGE := coreos/generate-prometheus-operator-manifests + +image: Dockerfile + docker build -t $(IMAGE) . + +BUILDER := docker run --rm -it --workdir /data -v ${PWD}:/data $(IMAGE) ./hack/scripts/generate-manifests.sh +generate: image @echo ">> Compiling assets and generating Kubernetes manifests" $(BUILDER) diff --git a/assets/grafana/.gitignore b/assets/grafana/.gitignore new file mode 100644 index 00000000..047d1277 --- /dev/null +++ b/assets/grafana/.gitignore @@ -0,0 +1 @@ +*-dashboard.json diff --git a/assets/grafana/_grafanalib.py b/assets/grafana/_grafanalib.py new file mode 100644 index 00000000..f030f101 --- /dev/null +++ b/assets/grafana/_grafanalib.py @@ -0,0 +1,92 @@ +from grafanalib import core +from grafanalib.core import Graph, Time, SparkLine, \ + Gauge, Templating, XAxis, YAxes + + +def Dashboard( + title, version, time, rows, graphTooltip=0, templating=None, +): + optional_args = {} + if templating is not None: + optional_args['templating'] = templating + return core.Dashboard( + title=title, refresh=None, schemaVersion=14, + version=version, time=time, timezone='browser', inputs=[ + { + 'name': 'DS_PROMETHEUS', + 'label': 'prometheus', + 'description': '', + 'type': 'datasource', + 'pluginId': 'prometheus', + 'pluginName': 'Prometheus' + }, + ], rows=rows, graphTooltip=graphTooltip, **optional_args, + ) + + +def Row( + panels, height=None, title='Dashboard Row', showTitle=False, + editable=None +): + assert isinstance(height, (type(None), int)) + return core.Row( + panels=panels, height=height, title=title, showTitle=showTitle, + titleSize='h6', editable=editable, + ) + + +def SingleStat( + title, id, targets, colorValue=False, gauge=Gauge(show=True), + valueFontSize='80%', thresholds=None, valueName='avg', valueMaps=None, + rangeMaps=None, mappingTypes=None, mappingType=None, postfix=None, + sparkline=SparkLine(), prefixFontSize='50%', colors=[ + (50, 172, 45, 0.97), + (237, 129, 
40, 0.89), + (245, 54, 54, 0.9), + ], span=None, format='none', transparent=None, +): + def merge_target(target): + return {**{ + 'intervalFactor': 2, + 'refId': 'A', + 'step': 600, + }, **target} + targets = [merge_target(t) for t in targets] + + return core.SingleStat( + title=title, id=id, colorValue=colorValue, + dataSource='${DS_PROMETHEUS}', gauge=gauge, + valueFontSize=valueFontSize, thresholds=thresholds, + valueName=valueName, valueMaps=valueMaps, rangeMaps=rangeMaps, + mappingTypes=mappingTypes, targets=targets, + mappingType=mappingType, format=format, colors=colors, span=span, + postfix=postfix, sparkline=sparkline, prefixFontSize=prefixFontSize, + hideTimeOverride=None, transparent=transparent, + ) + + +def Graph( + id, title, targets, dashLength=None, dashes=False, spaceLength=None, + xAxis=None, yAxes=None, nullPointMode='connected', +): + def merge_target(target): + return {**{ + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, **target} + + targets = [merge_target(t) for t in targets] + assert isinstance(yAxes, YAxes) + return core.Graph( + id=id, title=title, dashLength=dashLength, dashes=dashes, + spaceLength=spaceLength, targets=targets, xAxis=xAxis, yAxes=yAxes, + dataSource='${DS_PROMETHEUS}', nullPointMode=nullPointMode, + ) + + +def YAxis(format='none', label='', min=0, show=True): + return core.YAxis( + format=format, label=label, min=min, show=show + ) diff --git a/assets/grafana/deployment-dashboard.json b/assets/grafana/deployment-dashboard.json deleted file mode 100644 index 1ec5a6d9..00000000 --- a/assets/grafana/deployment-dashboard.json +++ /dev/null @@ -1,816 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - 
"name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 1, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "height": "200px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "CPU", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - 
"rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "80%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "Bps", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - 
"prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 
120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "metric": "kube_deployment_spec_replicas", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Available Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, 
- "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": 
"range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "", - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 1, - "legend": { - "avg": false, - "current": false, - "hideZero": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "current replicas", - "refId": "A", - "step": 30 - }, - { - 
"expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B", - "step": 30 - }, - { - "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "unavailable", - "refId": "C", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "D", - "step": 30 - }, - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transparent": false, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "deployment_namespace", - "options": [], - "query": 
"label_values(kube_deployment_metadata_generation, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Deployment", - "multi": false, - "name": "deployment_name", - "options": [], - "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "deployment", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Deployment", - "version": 1 -} \ No newline at end of file diff --git a/assets/grafana/deployment.dashboard.py b/assets/grafana/deployment.dashboard.py new file mode 100644 index 00000000..4a2bda46 --- /dev/null +++ b/assets/grafana/deployment.dashboard.py @@ -0,0 +1,467 @@ +import sys +import os.path +sys.path.insert(0, os.path.dirname(__file__)) +from _grafanalib import * + + +dashboard = Dashboard( + title='Deployment', + version=1, + graphTooltip=1, + time=Time(start='now-6h'), + templating=Templating(list=[ + { + 'allValue': '.*', + 'current': {}, + 'datasource': '${DS_PROMETHEUS}', + 'hide': 0, + 'includeAll': False, + 'label': 'Namespace', + 'multi': False, + 'name': 'deployment_namespace', + 'options': [], + 'query': 'label_values(kube_deployment_metadata_generation, ' + 'namespace)', + 'refresh': 1, + 'regex': '', + 'sort': 0, + 'tagValuesQuery': None, + 'tags': [], + 'tagsQuery': '', + 'type': 'query', + 'useTags': False, + }, + { + 'allValue': None, + 'current': 
{}, + 'datasource': '${DS_PROMETHEUS}', + 'hide': 0, + 'includeAll': False, + 'label': 'Deployment', + 'multi': False, + 'name': 'deployment_name', + 'options': [], + 'query': 'label_values(kube_deployment_metadata_generation' + '{namespace="$deployment_namespace"}, deployment)', + 'refresh': 1, + 'regex': '', + 'sort': 0, + 'tagValuesQuery': '', + 'tags': [], + 'tagsQuery': 'deployment', + 'type': 'query', + 'useTags': False, + }, + ]), + rows=[ + Row(panels=[ + SingleStat( + title='CPU', + id=8, + gauge=Gauge(show=False), + postfix='cores', + span=4, + valueFontSize='110%', + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + sparkline=SparkLine( + fillColor=(31, 118, 189, 0.18), + lineColor=(31, 120, 193), + show=True, + ), + targets=[ + { + 'expr': 'sum(rate(container_cpu_usage_seconds_total' + '{namespace=\"$deployment_namespace\",pod_name=~\"' + '$deployment_name.*\"}[3m]))', + }, + ], + ), + SingleStat( + title='Memory', + id=9, + postfix='GB', + prefixFontSize='80%', + gauge=Gauge(show=False), + span=4, + valueFontSize='110%', + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + sparkline=SparkLine( + fillColor=(31, 118, 189, 0.18), + lineColor=(31, 120, 193), + show=True, + ), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'expr': 'sum(container_memory_usage_bytes{namespace=' + '\"$deployment_namespace\",pod_name=~\"$' + 
'deployment_name.*\"}) / 1024^3', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 600, + }, + ], + ), + SingleStat( + title='Network', + format='Bps', + gauge=Gauge(thresholdMarkers=False), + id=7, + postfix='', + span=4, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + sparkline=SparkLine( + fillColor=(31, 118, 189, 0.18), + lineColor=(31, 120, 193), + show=True, + ), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'expr': 'sum(rate(container_network_transmit_' + 'bytes_total' + '{namespace=\"$deployment_namespace\",pod_name=~\"' + '$deployment_name.*\"}[3m])) + ' + 'sum(rate(container_network_receive_bytes_total' + '{namespace=\"$deployment_namespace\",pod_name=~' + '\"$deployment_name.*\"}[3m]))', + }, + ], + ), + ], + height=200, + ), + Row( + height=100, panels=[ + SingleStat( + title='Desired Replicas', + id=5, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + span=3, + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'metric': 'kube_deployment_spec_replicas', + 'expr': 'max(kube_deployment_spec_replicas' + '{deployment="$deployment_name",namespace=' + '"$deployment_namespace"}) without ' + '(instance, pod)', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + gauge=Gauge(thresholdMarkers=False, show=False), + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + ), + SingleStat( + title='Available Replicas', + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + gauge=Gauge(show=False), + id=6, + mappingType=1, + mappingTypes=[ 
+ { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': 'min(kube_deployment_status_replicas_' + 'available{deployment=\"$deployment_name\",' + 'namespace=\"$deployment_namespace\"}) without ' + '(instance, pod)', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + span=3, + sparkline=SparkLine(), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + } + ], + ), + SingleStat( + title='Observed Generation', + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + gauge=Gauge(), + id=3, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': 'max(kube_deployment_status_observed_' + 'generation{deployment=\"$deployment_name\",' + 'namespace=\"$deployment_namespace\"}) without ' + '(instance, pod)', + }, + ], + rangeMaps=[ + { + 'from': "null", + 'text': 'N/A', + 'to': 'null', + }, + ], + span=3, + sparkline=SparkLine(), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + } + ], + ), + SingleStat( + title='Metadata Generation', + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + gauge=Gauge(show=False), + id=2, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': 'max(kube_deployment_metadata_generation' + '{deployment=\"$deployment_name\",namespace=\"' + '$deployment_namespace\"}) without (instance, ' + 'pod)', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + span=3, + sparkline=SparkLine(), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + ), + ], + ), + Row( + height=350, panels=[ + Graph( + title='Replicas', + dashLength=10, + dashes=False, + id=1, + spaceLength=10, + 
targets=[ + { + 'expr': 'max(kube_deployment_status_replicas' + '{deployment=\"$deployment_name\",namespace=\"' + '$deployment_namespace\"}) without (instance, ' + 'pod)', + 'legendFormat': 'current replicas', + 'refId': 'A', + 'step': 30, + }, + { + 'expr': 'min(kube_deployment_status_replicas_' + 'available{deployment=\"$deployment_name\",' + 'namespace=\"$deployment_namespace\"}) without ' + '(instance, pod)', + 'legendFormat': 'available', + 'refId': 'B', + 'step': 30, + }, + { + 'expr': 'max(kube_deployment_status_replicas_' + 'unavailable{deployment=\"$deployment_name\",' + 'namespace=\"$deployment_namespace\"}) without ' + '(instance, pod)', + 'legendFormat': 'unavailable', + 'refId': 'C', + 'step': 30, + }, + { + 'expr': 'min(kube_deployment_status_replicas_' + 'updated{deployment=\"$deployment_name\",' + 'namespace=\"$deployment_namespace\"}) without ' + '(instance, pod)', + 'legendFormat': 'updated', + 'refId': 'D', + 'step': 30, + }, + { + 'expr': 'max(kube_deployment_spec_replicas' + '{deployment=\"$deployment_name\",namespace=\"' + '$deployment_namespace\"}) without ' + '(instance, pod)', + 'legendFormat': 'desired', + 'refId': 'E', + 'step': 30, + } + ], + xAxis=XAxis(mode='time'), + yAxes=YAxes( + YAxis(min=None), + YAxis(format='short', min=None, show=False), + ), + ), + ] + ), + ], +) diff --git a/assets/grafana/kubernetes-capacity-planning-dashboard.json b/assets/grafana/kubernetes-capacity-planning-dashboard.json deleted file mode 100644 index 3ea15947..00000000 --- a/assets/grafana/kubernetes-capacity-planning-dashboard.json +++ /dev/null @@ -1,1048 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": 
"datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", - "hide": false, - "intervalFactor": 10, - "legendFormat": "", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, 
- "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_load1)", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load5)", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load15)", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - 
"nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", - "intervalFactor": 2, - "legendFormat": "memory usage", - "metric": "memo", - "refId": "A", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_Buffers)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "memo", - "refId": "B", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_Cached)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "memo", - "refId": "C", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_MemFree)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "memo", - "refId": "D", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - 
"maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "intervalFactor": 2, - "metric": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 246, - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": 
"{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_disk_bytes_read[5m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(rate(node_disk_bytes_written[5m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 20 - }, - { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 20 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 12, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": 
"N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - 
"mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 276, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - 
"dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 11, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_info)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current number of Pods", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(kube_node_status_capacity_pods)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Maximum capacity of pods", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Cluster Pod Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, 
- "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80,90", - "title": "Pod Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Capacity Planning", - "version": 4 -} \ No newline at end of file diff --git a/assets/grafana/kubernetes-capacity-planning.dashboard.py b/assets/grafana/kubernetes-capacity-planning.dashboard.py new file mode 100644 index 00000000..bf8762bd --- /dev/null +++ b/assets/grafana/kubernetes-capacity-planning.dashboard.py @@ -0,0 +1,454 @@ +from grafanalib.core import * + + +dashboard = Dashboard( + title='Kubernetes Capacity Planning', + version=4, + gnetId=22, + graphTooltip=0, + refresh=False, + schemaVersion=14, + time=Time(start='now-1h'), + 
timezone='browser', + inputs=[ + { + 'name': 'DS_PROMETHEUS', + 'label': 'prometheus', + 'description': '', + 'type': 'datasource', + 'pluginId': 'prometheus', + 'pluginName': 'Prometheus', + } + ], + rows=[ + Row( + height=250, title='New Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Idle CPU', + id=3, + dataSource='${DS_PROMETHEUS}', + dashLength=10, + dashes=False, + isNew=False, + spaceLength=10, + span=6, + tooltip=Tooltip(msResolution=False), + yAxes=YAxes( + YAxis(format='percent', label='cpu usage',), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sum(rate(node_cpu{mode=\"idle\"}[2m])) ' + '* 100', + 'hide': False, + 'intervalFactor': 10, + 'legendFormat': '', + 'refId': 'A', + 'step': 50, + }, + ], + ), + Graph( + title='System Load', + id=9, + dataSource='${DS_PROMETHEUS}', + dashLength=10, + dashes=False, + isNew=False, + spaceLength=10, + span=6, + tooltip=Tooltip(msResolution=False), + yAxes=YAxes( + YAxis(format='percentunit', min=None), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sum(node_load1)', + 'intervalFactor': 4, + 'legendFormat': 'load 1m', + 'refId': 'A', + 'step': 20, + 'target': '', + }, + { + 'expr': 'sum(node_load5)', + 'intervalFactor': 4, + 'legendFormat': 'load 5m', + 'refId': 'B', + 'step': 20, + 'target': '' + }, + { + 'expr': 'sum(node_load15)', + 'intervalFactor': 4, + 'legendFormat': 'load 15m', + 'refId': 'C', + 'step': 20, + 'target': '', + }, + ], + ), + ], + ), + Row( + height=250, title='New Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Memory Usage', + id=4, + dataSource='${DS_PROMETHEUS}', + dashLength=10, + dashes=False, + isNew=False, + spaceLength=10, + span=9, + stack=True, + seriesOverrides=[ + { + 'alias': 'node_memory_SwapFree{instance=' + '\"172.17.0.1:9100\",job=\"prometheus\"}', + 'yaxis': 2, + } + ], + tooltip=Tooltip( + msResolution=False, valueType='individual' + ), + yAxes=YAxes( + YAxis(format='bytes', min='0'), + 
YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sum(node_memory_MemTotal) - sum(node_' + 'memory_MemFree) - sum(node_memory_Buffers) - ' + 'sum(node_memory_Cached)', + 'intervalFactor': 2, + 'legendFormat': 'memory usage', + 'metric': 'memo', + 'refId': 'A', + 'step': 10, + 'target': '', + }, + { + 'expr': 'sum(node_memory_Buffers)', + 'interval': '', + 'intervalFactor': 2, + 'legendFormat': 'memory buffers', + 'metric': 'memo', + 'refId': 'B', + 'step': 10, + 'target': '', + }, + { + 'expr': 'sum(node_memory_Cached)', + 'interval': '', + 'intervalFactor': 2, + 'legendFormat': 'memory cached', + 'metric': 'memo', + 'refId': 'C', + 'step': 10, + 'target': '', + }, + { + 'expr': 'sum(node_memory_MemFree)', + 'interval': '', + 'intervalFactor': 2, + 'legendFormat': 'memory free', + 'metric': 'memo', + 'refId': 'D', + 'step': 10, + 'target': '', + }, + ], + ), + SingleStat( + title='Memory Usage', + dataSource='${DS_PROMETHEUS}', + id=5, + format='percent', + span=3, + gauge=Gauge(show=True), + thresholds='80, 90', + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + targets=[ + { + 'expr': '((sum(node_memory_MemTotal) - ' + 'sum(node_memory_MemFree) - sum(' + 'node_memory_Buffers) - sum(node_memory_Cached)) ' + '/ sum(node_memory_MemTotal)) * 100', + 'intervalFactor': 2, + 'metric': '', + 'refId': 'A', + 'step': 60, + 'target': '', + }, + ], + ), + ], + ), + Row( + height=246, title='New Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Disk I/O', + dataSource='${DS_PROMETHEUS}', + id=6, + dashLength=10, + dashes=False, + isNew=False, + spaceLength=10, + span=9, + tooltip=Tooltip(msResolution=False), + seriesOverrides=[ + { + 'alias': 'read', + 'yaxis': 1 + }, + { + 'alias': '{instance=\"172.17.0.1:9100\"}', + 'yaxis': 2, + }, + { + 'alias': 'io time', + 'yaxis': 2, + }, + ], + yAxes=YAxes( + YAxis(format='bytes', min=None), + 
YAxis(format='ms', min=None), + ), + targets=[ + { + 'expr': 'sum(rate(node_disk_bytes_read[5m]))', + 'hide': False, + 'intervalFactor': 4, + 'legendFormat': 'read', + 'refId': 'A', + 'step': 20, + 'target': '' + }, + { + 'expr': 'sum(rate(node_disk_bytes_written[5m]))', + 'intervalFactor': 4, + 'legendFormat': 'written', + 'refId': 'B', + 'step': 20 + }, + { + 'expr': 'sum(rate(node_disk_io_time_ms[5m]))', + 'intervalFactor': 4, + 'legendFormat': 'io time', + 'refId': 'C', + 'step': 20 + }, + ], + ), + SingleStat( + title='Disk Space Usage', + dataSource='${DS_PROMETHEUS}', + id=12, + span=3, + format='percentunit', + valueName='current', + gauge=Gauge( + maxValue=1, + show=True, + ), + thresholds='0.75, 0.9', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + targets=[ + { + 'expr': '(sum(node_filesystem_size{device!=' + '\"rootfs\"}) - sum(node_filesystem_free{' + 'device!=\"rootfs\"})) / sum(node_filesystem_size' + '{device!=\"rootfs\"})', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 60, + 'target': '', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + ), + ] + ), + Row( + height=250, title='New Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Network Received', + dataSource='${DS_PROMETHEUS}', + id=8, + dashLength=10, + dashes=False, + isNew=False, + spaceLength=10, + span=6, + tooltip=Tooltip(msResolution=False), + seriesOverrides=[ + { + 'alias': 'transmitted', + 'yaxis': 2, + }, + ], + yAxes=YAxes( + YAxis(format='bytes', min=None), + YAxis(format='bytes', min=None), + ), + targets=[ + { + 'expr': 'sum(rate(node_network_receive_bytes' + '{device!~\"lo\"}[5m]))', + 'hide': False, + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 10, + 'target': '', + }, + ], + ), + Graph( + title='Network Transmitted', + dataSource='${DS_PROMETHEUS}', + id=10, + dashLength=10, + dashes=False, + isNew=False, + spaceLength=10, + span=6, + 
tooltip=Tooltip(msResolution=False), + seriesOverrides=[ + { + 'alias': 'transmitted', + 'yaxis': 2, + }, + ], + yAxes=YAxes( + YAxis(format='bytes', min=None), + YAxis(format='bytes', min=None), + ), + targets=[ + { + 'expr': 'sum(rate(node_network_transmit_bytes' + '{device!~\"lo\"}[5m]))', + 'hide': False, + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'B', + 'step': 10, + 'target': '', + }, + ], + ), + ], + ), + Row( + height=276, title='New Row', showTitle=False, + titleSize='h6', + panels=[ + Graph( + title='Cluster Pod Utilization', + dataSource='${DS_PROMETHEUS}', + id=11, + span=9, + dashes=False, + spaceLength=11, + tooltip=Tooltip( + msResolution=False, + valueType='individual', + ), + yAxes=YAxes( + YAxis(format='short', min=None), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sum(kube_pod_info)', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': 'Current number of Pods', + 'refId': 'A', + 'step': 10, + }, + { + 'expr': 'sum(kube_node_status_capacity_pods)', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': 'Maximum capacity of pods', + 'refId': 'B', + 'step': 10, + } + ], + ), + SingleStat( + title='Pod Utilization', + dataSource='${DS_PROMETHEUS}', + id=7, + span=3, + format='percent', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + gauge=Gauge( + show=True, + ), + thresholds='80, 90', + valueName='current', + targets=[ + { + 'expr': '100 - (sum(kube_node_status_capacity_' + 'pods) - sum(kube_pod_info)) / sum(kube_node_' + 'status_capacity_pods) * 100', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 60, + 'target': '', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + ), + ] + ), + ], +) diff --git a/assets/grafana/kubernetes-cluster-health-dashboard.json b/assets/grafana/kubernetes-cluster-health-dashboard.json deleted file mode 100644 index 46eb6ca7..00000000 --- 
a/assets/grafana/kubernetes-cluster-health-dashboard.json +++ /dev/null @@ -1,733 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "height": 254, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 1, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1,3", 
- "title": "Control Plane Components Down", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "Everything UP and healthy", - "value": "null" - }, - { - "op": "=", - "text": "", - "value": "" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "3,5", - "title": "Alerts Firing", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - 
"thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "3,5", - "title": "Alerts Pending", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": 
"rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1,3", - "title": "Crashlooping Pods", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 250, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1,3", - "title": "Node Not Ready", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - 
{ - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1,3", - "title": "Node Disk Pressure", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value 
to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1,3", - "title": "Node Memory Pressure", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kube_node_spec_unschedulable)", - "format": 
"time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1,3", - "title": "Nodes Unschedulable", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes Cluster Health", - "version": 9 -} \ No newline at end of file diff --git a/assets/grafana/kubernetes-cluster-health.dashboard.py b/assets/grafana/kubernetes-cluster-health.dashboard.py new file mode 100644 index 00000000..e7296f67 --- /dev/null +++ b/assets/grafana/kubernetes-cluster-health.dashboard.py @@ -0,0 +1,396 @@ +from grafanalib.core import * + + +dashboard = Dashboard( + title='Kubernetes Cluster Health', + version=9, + graphTooltip=0, + schemaVersion=14, + time=Time(start='now-6h'), + timezone='browser', + inputs=[ + { + 'name': 'DS_PROMETHEUS', + 'label': 'prometheus', + 'description': '', + 'type': 'datasource', + 'pluginId': 'prometheus', + 'pluginName': 'Prometheus' + }, + ], + rows=[ + Row( + height=254, title='Row', showTitle=False, + titleSize='h6', panels=[ + SingleStat( + title='Control Plane Components Down', + id=1, + dataSource='${DS_PROMETHEUS}', + gauge=Gauge(), + span=3, + thresholds='1, 3', + colorValue=True, + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'Everything UP and healthy', + 'value': 
'null', + }, + { + 'op': '=', + 'text': '', + 'value': '', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + targets=[ + { + 'expr': 'sum(up{job=~"apiserver|kube-scheduler|' + 'kube-controller-manager"} == 0)', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ], + ), + SingleStat( + title='Alerts Firing', + id=2, + dataSource='${DS_PROMETHEUS}', + gauge=Gauge(), + colorValue=True, + span=3, + valueName='current', + thresholds='1, 3', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': '0', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + targets=[ + { + 'expr': 'sum(ALERTS{alertstate="firing",' + 'alertname!="DeadMansSwitch"})', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ], + ), + SingleStat( + title='Alerts Pending', + id=3, + dataSource='${DS_PROMETHEUS}', + gauge=Gauge(), + colorValue=True, + span=3, + valueName='current', + thresholds='3, 5', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': '0', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + targets=[ + { + 'expr': 'sum(ALERTS{alertstate="pending",' + 'alertname!="DeadMansSwitch"})', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ], + ), + SingleStat( + title='Crashlooping Pods', + id=4, + dataSource='${DS_PROMETHEUS}', + gauge=Gauge(), + colorValue=True, + span=3, + valueName='current', + thresholds='1, 3', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + 
{ + 'op': '=', + 'text': '0', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + targets=[ + { + 'expr': 'count(increase(kube_pod_container_' + 'status_restarts[1h]) > 5)', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ], + ), + ], + ), + Row( + height=250, title='Row', showTitle=False, + titleSize='h6', panels=[ + SingleStat( + title='Node Not Ready', + id=5, + dataSource='${DS_PROMETHEUS}', + gauge=Gauge(), + colorValue=True, + span=3, + valueName='current', + thresholds='1, 3', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + targets=[ + { + 'expr': 'sum(kube_node_status_condition{' + 'condition="Ready",status!="true"})', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ], + ), + SingleStat( + title='Node Disk Pressure', + id=6, + dataSource='${DS_PROMETHEUS}', + gauge=Gauge(), + colorValue=True, + span=3, + valueName='current', + thresholds='1, 3', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + targets=[ + { + 'expr': 'sum(kube_node_status_condition' + '{condition="DiskPressure",status="true"})', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ], + ), + SingleStat( + title='Node Memory Pressure', + id=7, + dataSource='${DS_PROMETHEUS}', + gauge=Gauge(), + colorValue=True, + span=3, + valueName='current', + thresholds='1, 3', 
+ rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + targets=[ + { + 'expr': 'sum(kube_node_status_condition' + '{condition="MemoryPressure",status="true"})', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ], + ), + SingleStat( + title='Nodes Unschedulable', + id=8, + dataSource='${DS_PROMETHEUS}', + gauge=Gauge(), + colorValue=True, + span=3, + valueName='current', + thresholds='1, 3', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + targets=[ + { + 'expr': 'sum(kube_node_spec_unschedulable)', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ], + ), + ], + ), + ], +) diff --git a/assets/grafana/kubernetes-cluster-status-dashboard.json b/assets/grafana/kubernetes-cluster-status-dashboard.json deleted file mode 100644 index d30906bd..00000000 --- a/assets/grafana/kubernetes-cluster-status-dashboard.json +++ /dev/null @@ -1,896 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - 
"graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "height": 129, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 6, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1,3", - "title": "Control Plane UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "UP", - "value": "null" - } - ], - "valueName": "total" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": 
"value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 6, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "3,5", - "title": "Alerts Firing", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Cluster Health", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 168, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 1, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 
189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "50,80", - "title": "API Servers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "50,80", - "title": "Controller Managers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - 
"valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "50,80", - "title": "Schedulers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", 
- "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1,3", - "title": "Crashlooping Control Plane Pods", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Control Plane Status", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 158, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - 
"sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "80,90", - "title": "CPU Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "80,90", - "title": "Memory Utilization", - "type": "singlestat", - 
"valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "80,90", - "title": "Filesystem Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 10, - "interval": null, 
- "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "80,90", - "title": "Pod Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Capacity Planing", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes Cluster Status", - "version": 3 -} \ No newline at end of file diff --git a/assets/grafana/kubernetes-cluster-status.dashboard.py b/assets/grafana/kubernetes-cluster-status.dashboard.py new file mode 100644 index 00000000..7288c522 --- /dev/null +++ b/assets/grafana/kubernetes-cluster-status.dashboard.py @@ -0,0 +1,450 @@ +import sys +import os.path +sys.path.insert(0, 
os.path.dirname(__file__)) +from _grafanalib import * + + +dashboard = Dashboard( + title='Kubernetes Cluster Status', + version=3, + time=Time(start='now-6h'), + rows=[ + Row( + height=129, title='Cluster Health', showTitle=True, + panels=[ + SingleStat( + title='Control Plane UP', + id=5, + gauge=Gauge(show=False), + colorValue=True, + mappingType=1, + thresholds='1, 3', + valueName='total', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'UP', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': 'sum(up{job=~"apiserver|kube-scheduler|' + 'kube-controller-manager"} == 0)', + 'format': 'time_series', + }, + ] + ), + SingleStat( + title='Alerts Firing', + id=6, + gauge=Gauge(show=False), + colorValue=True, + mappingType=1, + thresholds='3, 5', + valueName='current', + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': '0', + 'value': 'null', + }, + ], + targets=[ + { + 'expr': 'sum(ALERTS{alertstate="firing",' + 'alertname!="DeadMansSwitch"})', + 'format': 'time_series', + }, + ] + ), + ], + ), + Row( + height=168, title='Control Plane Status', showTitle=True, + panels=[ + SingleStat( + title='API Servers UP', + id=1, + mappingType=1, + format='percent', + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + thresholds='50, 80', + span=3, + valueName='current', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 
'expr': '(sum(up{job="apiserver"} == 1) / ' + 'count(up{job="apiserver"})) * 100', + 'format': 'time_series', + }, + ] + ), + SingleStat( + title='Controller Managers UP', + id=2, + span=3, + mappingType=1, + thresholds='50, 80', + format='percent', + valueName='current', + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + targets=[ + { + 'expr': '(sum(up{job="kube-controller-manager"} ==' + ' 1) / count(up{job="kube-controller-manager"})) ' + '* 100', + 'format': 'time_series', + }, + ] + ), + SingleStat( + title='Schedulers UP', + id=3, + span=3, + mappingType=1, + format='percent', + thresholds='50, 80', + valueName='current', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + targets=[ + { + 'expr': '(sum(up{job="kube-scheduler"} == 1) / ' + 'count(up{job="kube-scheduler"})) * 100', + 'format': 'time_series', + }, + ] + ), + SingleStat( + title='Crashlooping Control Plane Pods', + id=4, + colorValue=True, + gauge=Gauge(show=False), + span=3, + mappingType=1, + thresholds='1, 3', + valueName='current', + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': '0', + 'value': 'null', + }, + ], + targets=[ + { + 'expr': 'count(increase(kube_pod_container_' + 
'status_restarts{namespace=~"kube-system|' + 'tectonic-system"}[1h]) > 5)', + 'format': 'time_series', + }, + ] + ), + ], + ), + Row( + height=158, title='Capacity Planning', showTitle=True, + panels=[ + SingleStat( + title='CPU Utilization', + id=8, + format='percent', + mappingType=1, + span=3, + thresholds='80, 90', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': 'sum(100 - (avg by (instance) (rate(' + 'node_cpu{job="node-exporter",mode="idle"}[5m])) ' + '* 100)) / count(node_cpu{job="node-exporter",' + 'mode="idle"})', + 'format': 'time_series', + }, + ] + ), + SingleStat( + title='Memory Utilization', + id=7, + format='percent', + span=3, + mappingType=1, + thresholds='80, 90', + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + targets=[ + { + 'expr': '((sum(node_memory_MemTotal) - sum(' + 'node_memory_MemFree) - sum(node_memory_Buffers) ' + '- sum(node_memory_Cached)) / sum(' + 'node_memory_MemTotal)) * 100', + 'format': 'time_series', + }, + ] + ), + SingleStat( + title='Filesystem Utilization', + id=9, + span=3, + format='percent', + mappingType=1, + thresholds='80, 90', + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': '(sum(node_filesystem_size{device!=' + '"rootfs"}) - sum(node_filesystem_free{device!=' + '"rootfs"})) / 
sum(node_filesystem_size{device!=' + '"rootfs"})', + 'format': 'time_series', + }, + ] + ), + SingleStat( + title='Pod Utilization', + id=10, + gauge=Gauge(show=True), + span=3, + mappingType=1, + format='percent', + thresholds='80, 90', + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + targets=[ + { + 'expr': '100 - (sum(kube_node_status_capacity_pods' + ') - sum(kube_pod_info)) / sum(kube_node_status_' + 'capacity_pods) * 100', + 'format': 'time_series', + }, + ] + ), + ], + ), + ], +) diff --git a/assets/grafana/kubernetes-control-plane-status-dashboard.json b/assets/grafana/kubernetes-control-plane-status-dashboard.json deleted file mode 100644 index 47721922..00000000 --- a/assets/grafana/kubernetes-control-plane-status-dashboard.json +++ /dev/null @@ -1,663 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - 
"datasource": "${DS_PROMETHEUS}", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 1, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50,80", - "title": "API Servers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - 
"sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50,80", - "title": "Controller Mangers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50,80", - "title": "Schedulers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - 
"colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "5,10", - "title": "API Server Request Error Rate", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 250, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - 
"percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 30 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "API Server Request Latency", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 250, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 60 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "End to end scheduling latency", - "tooltip": { - "shared": true, - 
"sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "dtdurations", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Error Rate", - "refId": "A", - "step": 60 - }, - { - "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Request Rate", - "refId": "B", - "step": 60 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "API Server Request Rates", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - 
"title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes Control Plane Status", - "version": 3 -} \ No newline at end of file diff --git a/assets/grafana/kubernetes-control-plane-status.dashboard.py b/assets/grafana/kubernetes-control-plane-status.dashboard.py new file mode 100644 index 00000000..b9020675 --- /dev/null +++ b/assets/grafana/kubernetes-control-plane-status.dashboard.py @@ -0,0 +1,336 @@ +from grafanalib.core import * + +dashboard = Dashboard( + title='Kubernetes Control Plane Status', + version=3, + graphTooltip=0, + schemaVersion=14, + time=Time(start='now-6h'), + timezone='browser', + refresh=None, + inputs=[ + { + 'name': 'DS_PROMETHEUS', + 'label': 'prometheus', + 'description': '', + 'type': 'datasource', + 'pluginId': 'prometheus', + 'pluginName': 'Prometheus' + }, + ], + rows=[ + Row( + title='Dashboard Row', showTitle=False, titleSize='h6', + panels=[ + SingleStat( + title='API Servers UP', + dataSource='${DS_PROMETHEUS}', + format='percent', + gauge=Gauge( + show=True, + ), + id=1, + span=3, + thresholds='50, 80', + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + } + ], + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'expr': '(sum(up{job=\"apiserver\"} == 1) / ' + 'sum(up{job=\"apiserver\"})) * 100', + 'format': 'time_series', + 'intervalFactor': 2, + 'refId': 
'A', + 'step': 600, + }, + ] + ), + SingleStat( + title='Controller Managers UP', + dataSource='${DS_PROMETHEUS}', + format='percent', + gauge=Gauge( + show=True, + ), + id=2, + span=3, + thresholds='50, 80', + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + } + ], + rangeMaps=([ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ]), + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'expr': '(sum(up{job=\"kube-controller-manager\"}' + ' == 1) / sum(up{job=\"kube-controller-manager\"' + '})) * 100', + 'format': 'time_series', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 600, + } + ] + ), + SingleStat( + title='Schedulers UP', + dataSource='${DS_PROMETHEUS}', + format='percent', + gauge=Gauge( + show=True, + ), + id=3, + span=3, + thresholds='50, 80', + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + } + ], + rangeMaps=([ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ]), + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'expr': '(sum(up{job=\"kube-scheduler\"} == 1) ' + '/ sum(up{job=\"kube-scheduler\"})) * 100', + 'format': 'time_series', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 600, + } + ] + ), + SingleStat( + title='API Server Request Error Rate', + dataSource='${DS_PROMETHEUS}', + format='percent', + gauge=Gauge( + show=True, + ), + id=4, + span=3, + thresholds='5, 10', + valueMaps=[ + { + 'op': '=', + 'text': '0', + 'value': 'null', + } + ], + rangeMaps=([ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ]), + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + } + ], + 
targets=[ + { + 'expr': 'max(sum by(instance) (rate(' + 'apiserver_request_count{code=~"5.."}[5m])) / ' + 'sum by(instance) (rate(apiserver_request_count' + '[5m]))) * 100', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 600, + }, + ] + ), + ], + ), + Row( + title='Dashboard Row', showTitle=False, titleSize='h6', + panels=[ + Graph( + title='API Server Request Latency', + id=7, + dataSource='${DS_PROMETHEUS}', + dashLength=10, + dashes=False, + isNew=False, + lineWidth=1, + nullPointMode='null', + tooltip=Tooltip( + msResolution=False, valueType='individual', + ), + spaceLength=10, + yAxes=YAxes( + YAxis(format='short', min=None), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sum by(verb) (rate(apiserver_latency_' + 'seconds:quantile[5m]) >= 0)', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 30, + } + ], + ), + ], + ), + Row( + title='Dashboard Row', showTitle=False, titleSize='h6', + panels=[ + Graph( + title='End to End Scheduling Latency', + id=5, + dataSource='${DS_PROMETHEUS}', + isNew=False, + dashLength=10, + lineWidth=1, + nullPointMode="null", + spaceLength=10, + span=6, + dashes=False, + tooltip=Tooltip( + msResolution=False, + valueType='individual', + ), + yAxes=YAxes( + YAxis(format='short', min=None), + YAxis(format='dtdurations', min=None), + ), + targets=[ + { + 'expr': 'cluster:scheduler_e2e_scheduling_' + 'latency_seconds:quantile', + 'format': 'time_series', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 60, + } + ], + ), + Graph( + title='API Server Request Rates', + id=6, + dataSource='${DS_PROMETHEUS}', + isNew=False, + dashLength=10, + lineWidth=1, + nullPointMode="null", + spaceLength=10, + span=6, + dashes=False, + tooltip=Tooltip( + msResolution=False, + valueType='individual', + ), + yAxes=YAxes( + YAxis(format='short', min=None), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sum by(instance) 
(rate(apiserver_' + 'request_count{code!~\"2..\"}[5m]))', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': 'Error Rate', + 'refId': 'A', + 'step': 60, + }, + { + 'expr': 'sum by(instance) (rate(apiserver_' + 'request_count[5m]))', + 'format': 'time_series', + 'intervalFactor': 2, + 'legendFormat': 'Request Rate', + 'refId': 'B', + 'step': 60, + }, + ], + ), + ], + ), + ], +) diff --git a/assets/grafana/kubernetes-resource-requests-dashboard.json b/assets/grafana/kubernetes-resource-requests-dashboard.json deleted file mode 100644 index fe52cba7..00000000 --- a/assets/grafana/kubernetes-resource-requests-dashboard.json +++ /dev/null @@ -1,434 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "", - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { - "collapse": false, - "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "fill": 1, - "id": 1, - "legend": { - 
"avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable CPU Cores", - "refId": "A", - "step": 20 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested CPU Cores", - "refId": "B", - "step": 20 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "CPU Cores", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "CPU Cores", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - 
"postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "CPU Cores", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "CPU Cores", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "300", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", - "hide": false, - 
"intervalFactor": 2, - "legendFormat": "Allocatable Memory", - "refId": "A", - "step": 20 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", - "intervalFactor": 2, - "legendFormat": "Requested Memory", - "refId": "B", - "step": 20 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": "Memory", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", - "intervalFactor": 2, 
- "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Memory", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Resource Requests", - "version": 2 -} \ No newline at end of file diff --git a/assets/grafana/kubernetes-resource-requests.dashboard.py b/assets/grafana/kubernetes-resource-requests.dashboard.py new file mode 100644 index 00000000..3b3b6157 --- /dev/null +++ b/assets/grafana/kubernetes-resource-requests.dashboard.py @@ -0,0 +1,200 @@ +from grafanalib.core import * + + +dashboard = Dashboard( + title='Kubernetes Resource Requests', + version=2, + graphTooltip=0, + refresh=False, + schemaVersion=14, + time=Time(start='now-3h'), + timezone='browser', + inputs=[ + { + 'name': 'DS_PROMETHEUS', + 'label': 'prometheus', + 'description': '', + 'type': 'datasource', + 'pluginId': 'prometheus', + 'pluginName': 'Prometheus' + }, + ], + rows=[ + Row( + height=300, title='CPU Cores', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='CPU Cores', + description='This represents the total [CPU resource ' + 'requests](https://kubernetes.io/docs/concepts/configu' + 'ration/manage-compute-resources-container/#meaning-of-' + 'cpu) in the cluster.\nFor comparison the total ' + '[allocatable CPU cores](https://github.com/kubernetes/' + 
'community/blob/master/contributors/design-proposals/' + 'node-allocatable.md) is also shown.', + id=1, + dataSource='${DS_PROMETHEUS}', + dashLength=10, + dashes=False, + isNew=False, + lineWidth=1, + spaceLength=10, + nullPointMode='null', + span=9, + tooltip=Tooltip( + msResolution=False, valueType='individual' + ), + yAxes=YAxes( + YAxis(format='short', label='CPU Cores', min=None,), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'min(sum(kube_node_status_allocatable_' + 'cpu_cores) by (instance))', + 'hide': False, + 'intervalFactor': 2, + 'legendFormat': 'Allocatable CPU Cores', + 'refId': 'A', + 'step': 20, + }, + { + 'expr': 'max(sum(kube_pod_container_resource_' + 'requests_cpu_cores) by (instance))', + 'hide': False, + 'intervalFactor': 2, + 'legendFormat': 'Requested CPU Cores', + 'refId': 'B', + 'step': 20, + }, + ], + ), + SingleStat( + title='CPU Cores', + dataSource='${DS_PROMETHEUS}', + id=2, + format='percent', + span=3, + gauge=Gauge(show=True), + sparkline=SparkLine(show=True), + valueFontSize='110%', + thresholds='80, 90', + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + targets=[ + { + 'expr': 'max(sum(kube_pod_container_resource_' + 'requests_cpu_cores) by (instance)) / min(sum' + '(kube_node_status_allocatable_cpu_cores) by ' + '(instance)) * 100', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 240, + }, + ], + ), + ], + ), + Row( + height=300, title='Memory', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Memory', + id=3, + dataSource='${DS_PROMETHEUS}', + description='This represents the total [memory resource ' + 'requests](https://kubernetes.io/docs/concepts/' + 'configuration/manage-compute-resources-container/' + '#meaning-of-memory) in the cluster.\nFor comparison ' + 'the total [allocatable memory](https://github.com/' + 'kubernetes/community/blob/master/contributors/' + 
'design-proposals/node-allocatable.md) is also shown.', + dashLength=10, + dashes=False, + lineWidth=1, + isNew=False, + spaceLength=10, + span=9, + nullPointMode='null', + tooltip=Tooltip( + msResolution=False, valueType='individual' + ), + yAxes=YAxes( + YAxis(format='bytes', label='Memory', min=None), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'min(sum(kube_node_status_allocatable_' + 'memory_bytes) by (instance))', + 'hide': False, + 'intervalFactor': 2, + 'legendFormat': 'Allocatable Memory', + 'refId': 'A', + 'step': 20, + }, + { + 'expr': 'max(sum(kube_pod_container_resource_' + 'requests_memory_bytes) by (instance))', + 'hide': False, + 'intervalFactor': 2, + 'legendFormat': 'Requested Memory', + 'refId': 'B', + 'step': 20, + }, + ], + ), + SingleStat( + title='Memory', + dataSource='${DS_PROMETHEUS}', + id=4, + format='percent', + span=3, + gauge=Gauge(show=True), + sparkline=SparkLine(show=True), + valueFontSize='110%', + thresholds='80, 90', + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + targets=[ + { + 'expr': 'max(sum(kube_pod_container_resource_' + 'requests_memory_bytes) by (instance)) / ' + 'min(sum(kube_node_status_allocatable_memory_' + 'bytes) by (instance)) * 100', + 'intervalFactor': 2, + 'legendFormat': '', + 'refId': 'A', + 'step': 240, + }, + ], + ), + ], + ), + ], +) diff --git a/assets/grafana/nodes-dashboard.json b/assets/grafana/nodes-dashboard.json deleted file mode 100644 index 7ab526c3..00000000 --- a/assets/grafana/nodes-dashboard.json +++ /dev/null @@ -1,892 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - 
"name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": true, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", - "hide": false, - "intervalFactor": 10, - "legendFormat": "{{cpu}}", - "refId": "A", - "step": 50 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - 
"dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_load1{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "node_load5{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "node_load15{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 4, 
- "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory used", - "metric": "", - "refId": "C", - "step": 10 - }, - { - "expr": "node_memory_Buffers{instance=\"$server\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "", - "refId": "E", - "step": 10 - }, - { - "expr": "node_memory_Cached{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "", - "refId": "F", - "step": 10 - }, - { - "expr": "node_memory_MemFree{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "", - "refId": "D", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - 
"show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - 
"legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 20 - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 20 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - 
"minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk space usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - 
"yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "alerting": {}, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted ", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": 
"graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "server", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Nodes", - "version": 2 -} \ No newline at end of file diff --git a/assets/grafana/nodes.dashboard.py b/assets/grafana/nodes.dashboard.py new file mode 100644 index 00000000..f2e7b18e --- /dev/null +++ b/assets/grafana/nodes.dashboard.py @@ -0,0 +1,414 @@ +from grafanalib.core import * + + +dashboard = Dashboard( + title='Nodes', + version=2, + description='Dashboard to get an overview of one server', + gnetId=22, + graphTooltip=0, + refresh=False, + schemaVersion=14, + time=Time(start='now-1h'), + timezone='browser', + inputs=[ + { + 'name': 'DS_PROMETHEUS', + 'label': 'prometheus', + 'description': '', + 'type': 'datasource', + 'pluginId': 'prometheus', + 'pluginName': 'Prometheus' + }, + ], + 
templating=Templating(list=[ + { + 'allValue': None, + 'current': {}, + 'datasource': '${DS_PROMETHEUS}', + 'hide': 0, + 'includeAll': False, + 'label': None, + 'multi': False, + 'name': 'server', + 'options': [], + 'query': 'label_values(node_boot_time, instance)', + 'refresh': 1, + 'regex': '', + 'sort': 0, + 'tagValuesQuery': '', + 'tags': [], + 'tagsQuery': '', + 'type': 'query', + 'useTags': False, + }, + ]), + rows=[ + Row( + height=250, title='New Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Idle CPU', + dataSource='${DS_PROMETHEUS}', + id=3, + isNew=False, + spaceLength=10, + span=6, + dashLength=10, + dashes=False, + tooltip=Tooltip(msResolution=False), + yAxes=YAxes( + YAxis( + format='percent', + label='cpu usage', + max=100, + ), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': '100 - (avg by (cpu) (irate(node_cpu' + '{mode=\"idle\", instance=\"$server\"}[5m])) ' + '* 100)', + 'hide': False, + 'intervalFactor': 10, + 'legendFormat': '{{cpu}}', + 'refId': 'A', + 'step': 50, + } + ], + ), + Graph( + title='System Load', + dataSource='${DS_PROMETHEUS}', + id=9, + isNew=False, + spaceLength=10, + span=6, + dashLength=10, + dashes=False, + tooltip=Tooltip(msResolution=False), + yAxes=YAxes( + YAxis(format='percentunit', min=None,), + YAxis(format='short', min=None,), + ), + targets=[ + { + 'expr': 'node_load1{instance=\"$server\"}', + 'intervalFactor': 4, + 'legendFormat': 'load 1m', + 'refId': 'A', + 'step': 20, + 'target': '', + }, + { + 'expr': 'node_load5{instance=\"$server\"}', + 'intervalFactor': 4, + 'legendFormat': 'load 5m', + 'refId': 'B', + 'step': 20, + 'target': '', + }, + { + 'expr': 'node_load15{instance=\"$server\"}', + 'intervalFactor': 4, + 'legendFormat': 'load 15m', + 'refId': 'C', + 'step': 20, + 'target': '', + }, + ], + ), + ], + ), + Row( + height=250, title='New Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Memory Usage', + dataSource='${DS_PROMETHEUS}', + id=4, + 
isNew=False, + spaceLength=10, + span=9, + stack=True, + dashLength=10, + dashes=False, + tooltip=Tooltip( + msResolution=False, valueType='individual', + ), + seriesOverrides=[ + { + 'alias': 'node_memory_SwapFree{instance=' + '\"172.17.0.1:9100\",job=\"prometheus\"}', + 'yaxis': 2, + }, + ], + yAxes=YAxes( + YAxis(format='bytes', min='0',), + YAxis(format='short', min=None,), + ), + targets=[ + { + 'expr': 'node_memory_MemTotal{instance=' + '\"$server\"} - node_memory_MemFree{instance=' + '\"$server\"} - node_memory_Buffers{instance=' + '\"$server\"} - node_memory_Cached{instance=' + '\"$server\"}', + 'hide': False, + 'interval': '', + 'intervalFactor': 2, + 'legendFormat': 'memory used', + 'metric': '', + 'refId': 'C', + 'step': 10, + }, + { + 'expr': 'node_memory_Buffers{instance=' + '\"$server\"}', + 'interval': '', + 'intervalFactor': 2, + 'legendFormat': 'memory buffers', + 'metric': '', + 'refId': 'E', + 'step': 10, + }, + { + 'expr': 'node_memory_Cached{instance=\"$server\"}', + 'intervalFactor': 2, + 'legendFormat': 'memory cached', + 'metric': '', + 'refId': 'F', + 'step': 10, + }, + { + 'expr': 'node_memory_MemFree{instance=' + '\"$server\"}', + 'intervalFactor': 2, + 'legendFormat': 'memory free', + 'metric': '', + 'refId': 'D', + 'step': 10, + }, + ], + ), + SingleStat( + title='Memory Usage', + dataSource='${DS_PROMETHEUS}', + id=5, + format='percent', + gauge=Gauge(show=True), + span=3, + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + } + ], + thresholds='80, 90', + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + targets=[ + { + 'expr': '((node_memory_MemTotal{instance=' + '\"$server\"} - node_memory_MemFree{instance=' + '\"$server\"} - node_memory_Buffers{instance=' + '\"$server\"} - node_memory_Cached{instance=' + '\"$server\"}) / node_memory_MemTotal{instance=' + '\"$server\"}) * 100', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 60, + 'target': '', + }, + ], + ), + ], + ), + Row( + 
height=250, titleSize='h6', title='New Row', + showTitle=False, panels=[ + Graph( + title='Disk I/O', + dataSource='${DS_PROMETHEUS}', + id=6, + dashLength=10, + dashes=False, + spaceLength=10, + span=9, + tooltip=Tooltip(msResolution=False), + yAxes=YAxes( + YAxis( + format='bytes', + min=None, + ), + YAxis( + format='ms', + min=None, + ), + ), + seriesOverrides=[ + { + 'alias': 'read', + 'yaxis': 1, + }, + { + 'alias': '{instance=\"172.17.0.1:9100\"}', + 'yaxis': 2, + }, + { + 'alias': 'io time', + 'yaxis': 2, + }, + ], + targets=[ + { + 'expr': 'sum by (instance) (rate(node_disk_' + 'bytes_read{instance=\"$server\"}[2m]))', + 'hide': False, + 'intervalFactor': 4, + 'legendFormat': 'read', + 'refId': 'A', + 'step': 20, + 'target': '', + }, + { + 'expr': 'sum by (instance) (rate(node_disk_' + 'bytes_written{instance=\"$server\"}[2m]))', + 'intervalFactor': 4, + 'legendFormat': 'written', + 'refId': 'B', + 'step': 20 + }, + { + 'expr': 'sum by (instance) (rate(node_disk_io_' + 'time_ms{instance=\"$server\"}[2m]))', + 'intervalFactor': 4, + 'legendFormat': 'io time', + 'refId': 'C', + 'step': 20, + }, + ], + ), + SingleStat( + title='Disk Space Usage', + dataSource='${DS_PROMETHEUS}', + id=7, + thresholds='0.75, 0.9', + valueName='current', + format='percentunit', + span=3, + gauge=Gauge( + maxValue=1, + show=True, + ), + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + } + ], + targets=[ + { + 'expr': '(sum(node_filesystem_size{device!=' + '\"rootfs\",instance=\"$server\"}) - ' + 'sum(node_filesystem_free{device!=\"rootfs\",' + 'instance=\"$server\"})) / sum(node_filesystem_' + 'size{device!=\"rootfs\",instance=\"$server\"})', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 60, + 'target': '', + }, + ], + ), + ], + ), + Row( + height=250, title='New Row', titleSize='h6', + showTitle=False, + panels=[ + Graph( + title='Network Received', + dataSource='${DS_PROMETHEUS}', 
+ id=8, + dashLength=10, + dashes=False, + isNew=False, + spaceLength=10, + span=6, + tooltip=Tooltip(msResolution=False), + yAxes=YAxes( + YAxis(format='bytes', min=None), + YAxis(format='bytes', min=None), + ), + seriesOverrides=[ + { + 'alias': 'transmitted', + 'yaxis': 2, + }, + ], + targets=[ + { + 'expr': 'rate(node_network_receive_bytes{' + 'instance=\"$server\",device!~\"lo\"}[5m])', + 'hide': False, + 'intervalFactor': 2, + 'legendFormat': '{{device}}', + 'refId': 'A', + 'step': 10, + 'target': '' + } + ], + ), + Graph( + title='Network Transmitted', + dataSource='${DS_PROMETHEUS}', + id=10, + dashLength=10, + dashes=False, + isNew=False, + spaceLength=10, + span=6, + tooltip=Tooltip(msResolution=False), + yAxes=YAxes( + YAxis(format='bytes', min=None), + YAxis(format='bytes', min=None), + ), + seriesOverrides=[ + { + 'alias': 'transmitted', + 'yaxis': 2, + }, + ], + targets=[ + { + 'expr': 'rate(node_network_transmit_bytes' + '{instance=\"$server\",device!~\"lo\"}[5m])', + 'hide': False, + 'intervalFactor': 2, + 'legendFormat': '{{device}}', + 'refId': 'B', + 'step': 10, + 'target': '', + }, + ], + ), + ], + ), + ], +) diff --git a/assets/grafana/pods-dashboard.json b/assets/grafana/pods-dashboard.json deleted file mode 100644 index 2d3c1c84..00000000 --- a/assets/grafana/pods-dashboard.json +++ /dev/null @@ -1,432 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 1, - "hideControls": false, - "id": null, - "links": [], - "rows": [ - { 
- "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 1, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Current: {{ container_name }}", - "metric": "container_memory_usage_bytes", - "refId": "A", - "step": 15 - }, - { - "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_resource_requests_memory_bytes", - "refId": "B", - "step": 20 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Row", - 
"titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", - "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A", - "step": 30 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, 
- "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "refId": "A", - "step": 30 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network I/O", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [], - "query": 
"label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - "options": [], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Pods", - "version": 1 -} \ No newline at end of file diff --git a/assets/grafana/pods.dashboard.py b/assets/grafana/pods.dashboard.py new file mode 100644 index 00000000..1003d8a5 --- /dev/null +++ b/assets/grafana/pods.dashboard.py @@ -0,0 +1,215 @@ +from grafanalib.core import * + + +dashboard = Dashboard( + title='Pods', + version=1, + graphTooltip=1, + refresh=False, + schemaVersion=14, + time=Time(start='now-6h'), + timezone='browser', + inputs=[ + { + 'name': 'DS_PROMETHEUS', + 'label': 'prometheus', + 'description': '', + 'type': 'datasource', + 'pluginId': 'prometheus', + 'pluginName': 'Prometheus' + }, + ], + templating=Templating(list=[ + { + 'allValue': '.*', + 'current': {}, + 'datasource': '${DS_PROMETHEUS}', + 'hide': 0, + 'includeAll': True, + 'label': 'Namespace', + 'multi': False, + 'name': 'namespace', + 'options': [], + 'query': 'label_values(kube_pod_info, namespace)', + 'refresh': 1, + 'regex': '', + 'sort': 0, + 'tagValuesQuery': '', + 'tags': [], + 'tagsQuery': '', + 'type': 
'query', + 'useTags': False, + }, + { + 'allValue': None, + 'current': {}, + 'datasource': '${DS_PROMETHEUS}', + 'hide': 0, + 'includeAll': False, + 'label': 'Pod', + 'multi': False, + 'name': 'pod', + 'options': [], + 'query': 'label_values(kube_pod_info{namespace=~"$namespace"}, ' + 'pod)', + 'refresh': 1, + 'regex': '', + 'sort': 0, + 'tagValuesQuery': '', + 'tags': [], + 'tagsQuery': '', + 'type': 'query', + 'useTags': False, + }, + { + 'allValue': '.*', + 'current': {}, + 'datasource': '${DS_PROMETHEUS}', + 'hide': 0, + 'includeAll': True, + 'label': 'Container', + 'multi': False, + 'name': 'container', + 'options': [], + 'query': 'label_values(kube_pod_container_info{namespace=' + '"$namespace", pod="$pod"}, container)', + 'refresh': 1, + 'regex': '', + 'sort': 0, + 'tagValuesQuery': '', + 'tags': [], + 'tagsQuery': '', + 'type': 'query', + 'useTags': False, + }, + ]), + rows=[ + Row( + height=250, title='Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Memory Usage', + dataSource='${DS_PROMETHEUS}', + id=1, + isNew=False, + spaceLength=10, + span=12, + dashLength=10, + dashes=False, + tooltip=Tooltip(msResolution=True, valueType='cumulative'), + legend=Legend( + alignAsTable=True, avg=True, current=True, + rightSide=True, total=False, values=True, + ), + yAxes=YAxes( + YAxis( + format='bytes', min=None, + ), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sum by(container_name) (container_' + 'memory_usage_bytes{pod_name="$pod", ' + 'container_name=~"$container", ' + 'container_name!="POD"})', + 'interval': '10s', + 'intervalFactor': 1, + 'legendFormat': 'Current: {{ container_name }}', + 'metric': 'container_memory_usage_bytes', + 'refId': 'A', + 'step': 15, + }, + { + 'expr': 'kube_pod_container_resource_requests_' + 'memory_bytes{pod="$pod", container=~' + '"$container"}', + 'interval': '10s', + 'intervalFactor': 2, + 'legendFormat': 'Requested: {{ container }}', + 'metric': 'kube_pod_container_resource_' + 
'requests_memory_bytes', + 'refId': 'B', + 'step': 20, + }, + ], + ), + ], + ), + Row( + height=250, title='Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='CPU Usage', + dataSource='${DS_PROMETHEUS}', + id=2, + isNew=False, + spaceLength=10, + span=12, + dashLength=10, + dashes=False, + legend=Legend( + alignAsTable=True, avg=True, current=True, + rightSide=True, total=False, values=True, + ), + tooltip=Tooltip(msResolution=True, valueType='cumulative'), + yAxes=YAxes( + YAxis( + format='short', min=None, + ), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sum by (container_name)(' + 'rate(container_cpu_usage_seconds_total' + '{image!="",container_name!="POD",pod_name=' + '"$pod"}[1m]))', + 'intervalFactor': 2, + 'legendFormat': '{{ container_name }}', + 'refId': 'A', + 'step': 30 + }, + ], + ), + ], + ), + Row( + height=250, title='New Row', showTitle=False, + titleSize='h6', panels=[ + Graph( + title='Network I/O', + dataSource='${DS_PROMETHEUS}', + id=3, + isNew=False, + spaceLength=10, + span=12, + dashLength=10, + dashes=False, + legend=Legend( + alignAsTable=True, avg=True, current=True, + rightSide=True, total=False, values=True, + ), + tooltip=Tooltip(msResolution=True, valueType='cumulative'), + yAxes=YAxes( + YAxis( + format='bytes', min=None, + ), + YAxis(format='short', min=None), + ), + targets=[ + { + 'expr': 'sort_desc(sum by (pod_name) (rate' + '(container_network_receive_bytes_total{' + 'pod_name="$pod"}[1m])))', + 'intervalFactor': 2, + 'legendFormat': '{{ pod_name }}', + 'refId': 'A', + 'step': 30 + }, + ], + ), + ], + ), + ], +) diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index 0a37c491..b4273baa 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -226,7 
+226,7 @@ bin-pack-files() { # the max size of the queue is limited by DATA_SIZE_LIMIT # while there's room available in the queue we add files. # when there's no room we create a configmap with the members of the queue - # before adding the file to a cleaned queue + # before adding the file to a cleaned queue # Counters initialization is not in the scope of this function local file="" @@ -313,7 +313,7 @@ touch $GRAFANA_OUTPUT_FILE || { echo "ERROR: Unable to create or modify $GRAFANA echo "# Starting execution of $SCRIPT_BASE on $DATE_EXEC" echo "# Configured size limit: $DATA_SIZE_LIMIT bytes" -echo "# Grafna input dashboards and datasources will be read from: $DASHBOARDS_DIR" +echo "# Grafana input dashboards and datasources will be read from: $DASHBOARDS_DIR" echo "# Grafana Dashboards ConfigMap will be created into file:" echo "$OUTPUT_FILE" echo "# Grafana Deployment manifest will be created into file:" diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh index 78ad27ac..5b0ed3e8 100755 --- a/hack/scripts/generate-dashboards-configmap.sh +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -1,13 +1,23 @@ #!/bin/bash +set -e cat <<-EOF apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboards + name: grafana-dashboards-0 data: EOF +virtualenv -p python3 .env +source .env/bin/activate +pip install -Ur requirements.txt +for f in assets/grafana/*.dashboard.py +do + JSON_FILENAME="$(pwd)/${f%%.*}-dashboard.json" + generate-dashboard $f -o $JSON_FILENAME 2>&1 > /dev/null +done + for f in assets/grafana/*-dashboard.json do echo " $(basename $f): |+" diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index bb0c0685..7f300dac 100755 --- a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -1,10 +1,11 @@ #!/bin/bash +set -e # Generate Alert Rules ConfigMap hack/scripts/generate-rules-configmap.sh > 
manifests/prometheus/prometheus-k8s-rules.yaml # Generate Dashboard ConfigMap -#hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml +hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml # Generate Dashboard ConfigMap with configmap-generator tool # Max Size per ConfigMap: 240000 diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 13dc4e64..e76a1103 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -9,48 +9,20 @@ data: { "__inputs": [ { - "name": "DS_PROMETHEUS", - "label": "prometheus", "description": "", - "type": "datasource", + "label": "prometheus", + "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { "list": [] }, "editable": true, - "gnetId": null, "graphTooltip": 1, "hideControls": false, - "id": null, "links": [], "rows": [ { @@ -58,7 +30,6 @@ data: "height": "200px", "panels": [ { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -68,7 +39,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "none", "gauge": { "maxValue": 100, @@ -78,7 +48,6 @@ data: "thresholdMarkers": true }, "id": 8, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -93,7 +62,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "cores", "postfixFontSize": "50%", "prefix": "", @@ -112,16 
+80,14 @@ data: "lineColor": "rgb(31, 120, 193)", "show": true }, - "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", "intervalFactor": 2, "refId": "A", "step": 600 } ], - "thresholds": "", "title": "CPU", "type": "singlestat", "valueFontSize": "110%", @@ -135,7 +101,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -145,7 +110,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "none", "gauge": { "maxValue": 100, @@ -155,7 +119,6 @@ data: "thresholdMarkers": true }, "id": 9, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -170,7 +133,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "GB", "postfixFontSize": "50%", "prefix": "", @@ -189,7 +151,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": true }, - "tableColumn": "", "targets": [ { "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", @@ -198,7 +159,6 @@ data: "step": 600 } ], - "thresholds": "", "title": "Memory", "type": "singlestat", "valueFontSize": "110%", @@ -212,7 +172,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -222,7 +181,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "Bps", "gauge": { "maxValue": 100, @@ -232,7 +190,6 @@ data: "thresholdMarkers": false }, "id": 7, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -247,7 +204,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": 
"", @@ -266,16 +222,14 @@ data: "lineColor": "rgb(31, 120, 193)", "show": true }, - "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", "intervalFactor": 2, "refId": "A", "step": 600 } ], - "thresholds": "", "title": "Network", "type": "singlestat", "valueFontSize": "80%", @@ -289,11 +243,8 @@ data: "valueName": "avg" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "Row", + "title": "Dashboard Row", "titleSize": "h6" }, { @@ -301,7 +252,6 @@ data: "height": "100px", "panels": [ { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -310,9 +260,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, "editable": true, - "error": false, "format": "none", "gauge": { "maxValue": 100, @@ -322,7 +270,6 @@ data: "thresholdMarkers": false }, "id": 5, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -337,8 +284,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -356,7 +301,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", @@ -366,7 +310,6 @@ data: "step": 600 } ], - "thresholds": "", "title": "Desired Replicas", "type": 
"singlestat", "valueFontSize": "80%", @@ -380,7 +323,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -390,7 +332,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "none", "gauge": { "maxValue": 100, @@ -400,7 +341,6 @@ data: "thresholdMarkers": true }, "id": 6, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -415,8 +355,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -434,7 +372,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", @@ -443,7 +380,6 @@ data: "step": 600 } ], - "thresholds": "", "title": "Available Replicas", "type": "singlestat", "valueFontSize": "80%", @@ -457,7 +393,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -467,7 +402,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "none", "gauge": { "maxValue": 100, @@ -477,7 +411,6 @@ data: "thresholdMarkers": true }, "id": 3, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -492,8 +425,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -511,17 +442,14 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "", "title": 
"Observed Generation", "type": "singlestat", "valueFontSize": "80%", @@ -535,7 +463,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -545,7 +472,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "none", "gauge": { "maxValue": 100, @@ -555,7 +481,6 @@ data: "thresholdMarkers": true }, "id": 2, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -570,8 +495,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -589,17 +512,14 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "", "title": "Metadata Generation", "type": "singlestat", "valueFontSize": "80%", @@ -613,11 +533,8 @@ data: "valueName": "avg" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "Dashboard Row", "titleSize": "h6" }, { @@ -633,17 +550,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 1, + "isNew": true, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -695,9 +618,6 @@ data: "step": 30 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "Replicas", "tooltip": { "msResolution": true, @@ -705,12 +625,9 @@ data: "sort": 0, 
"value_type": "cumulative" }, - "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -719,30 +636,24 @@ data: "format": "none", "label": "", "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, + "label": "", "logBase": 1, - "max": null, - "min": null, "show": false } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "Dashboard Row", "titleSize": "h6" } ], "schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { @@ -821,7 +732,8 @@ data: "timezone": "browser", "title": "Deployment", "version": 1 - } , + } + , "inputs": [ { "name": "DS_PROMETHEUS", @@ -838,58 +750,30 @@ data: { "__inputs": [ { - "name": "DS_PROMETHEUS", - "label": "prometheus", "description": "", - "type": "datasource", + "label": "prometheus", + "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { "list": [] }, - "description": "", "editable": true, "gnetId": 22, "graphTooltip": 0, "hideControls": false, - "id": null, "links": [], "refresh": false, "rows": [ { "collapse": false, + "editable": true, "height": "250px", "panels": [ { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -898,16 +782,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 
0.22)" + }, "id": 3, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -932,10 +823,7 @@ data: "step": 50 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", + "title": "Idle CPU", "tooltip": { "msResolution": false, "shared": true, @@ -944,9 +832,7 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -955,22 +841,17 @@ data: "format": "percent", "label": "cpu usage", "logBase": 1, - "max": null, "min": 0, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -979,16 +860,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 9, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -1029,10 +917,7 @@ data: "target": "" } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", + "title": "System Load", "tooltip": { "msResolution": false, "shared": true, @@ -1041,45 +926,34 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] 
} ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" }, { "collapse": false, + "editable": true, "height": "250px", "panels": [ { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -1088,16 +962,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 4, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -1158,10 +1039,7 @@ data: "target": "" } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", + "title": "Memory Usage", "tooltip": { "msResolution": false, "shared": true, @@ -1170,33 +1048,25 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -1206,7 +1076,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "percent", "gauge": { "maxValue": 100, @@ -1215,8 +1084,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 5, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -1231,7 +1100,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -1250,7 +1118,6 @@ data: "lineColor": "rgb(31, 120, 
193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", @@ -1262,7 +1129,8 @@ data: } ], "thresholds": "80, 90", - "title": "Memory usage", + "title": "Memory Usage", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -1275,19 +1143,16 @@ data: "valueName": "avg" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" }, { "collapse": false, - "height": 246, + "editable": true, + "height": "246px", "panels": [ { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -1296,16 +1161,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 6, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -1358,9 +1230,6 @@ data: "step": 20 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "Disk I/O", "tooltip": { "msResolution": false, @@ -1370,33 +1239,24 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "ms", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -1406,7 +1266,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "percentunit", 
"gauge": { "maxValue": 1, @@ -1415,8 +1274,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 12, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -1431,7 +1290,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -1450,7 +1308,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", @@ -1461,7 +1318,8 @@ data: } ], "thresholds": "0.75, 0.9", - "title": "Disk space usage", + "title": "Disk Space Usage", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -1474,19 +1332,16 @@ data: "valueName": "current" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" }, { "collapse": false, + "editable": true, "height": "250px", "panels": [ { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -1495,16 +1350,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 8, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -1516,7 +1378,7 @@ data: "renderer": "flot", "seriesOverrides": [ { - "alias": "transmitted ", + "alias": "transmitted", "yaxis": 2 } ], @@ -1535,10 +1397,7 @@ data: "target": "" } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", + "title": "Network 
Received", "tooltip": { "msResolution": false, "shared": true, @@ -1547,33 +1406,24 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -1582,16 +1432,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 10, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -1603,7 +1460,7 @@ data: "renderer": "flot", "seriesOverrides": [ { - "alias": "transmitted ", + "alias": "transmitted", "yaxis": 2 } ], @@ -1622,10 +1479,7 @@ data: "target": "" } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", + "title": "Network Transmitted", "tooltip": { "msResolution": false, "shared": true, @@ -1634,70 +1488,69 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" }, { "collapse": false, - "height": 276, + "editable": true, + "height": "276px", "panels": [ { "aliasColors": {}, "bars": false, - 
"dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 11, + "isNew": true, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [], - "nullPointMode": "null", + "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], - "spaceLength": 10, + "spaceLength": 11, "span": 9, "stack": false, "steppedLine": false, @@ -1719,44 +1572,33 @@ data: "step": 10 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "Cluster Pod Utilization", "tooltip": { + "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -1766,7 +1608,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "percent", "gauge": { "maxValue": 100, @@ -1775,8 +1616,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 7, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -1791,7 +1632,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -1810,7 +1650,6 @@ data: "lineColor": "rgb(31, 120, 
193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", @@ -1822,8 +1661,9 @@ data: "target": "" } ], - "thresholds": "80,90", + "thresholds": "80, 90", "title": "Pod Utilization", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -1836,15 +1676,13 @@ data: "valueName": "current" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "Dashboard Row", + "title": "New Row", "titleSize": "h6" } ], "schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { @@ -1882,7 +1720,8 @@ data: "timezone": "browser", "title": "Kubernetes Capacity Planning", "version": 4 - } , + } + , "inputs": [ { "name": "DS_PROMETHEUS", @@ -1899,50 +1738,29 @@ data: { "__inputs": [ { - "name": "DS_PROMETHEUS", - "label": "prometheus", "description": "", - "type": "datasource", + "label": "prometheus", + "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { "list": [] }, "editable": true, - "gnetId": null, "graphTooltip": 0, "hideControls": false, - "id": null, "links": [], + "refresh": "10s", "rows": [ { "collapse": false, - "height": 254, + "editable": true, + "height": "254px", "panels": [ { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -1951,6 +1769,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -1959,8 +1778,8 @@ data: 
"thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 1, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -1975,7 +1794,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -1994,7 +1812,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", @@ -2005,8 +1822,9 @@ data: "step": 600 } ], - "thresholds": "1,3", + "thresholds": "1, 3", "title": "Control Plane Components Down", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2024,7 +1842,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2033,7 +1850,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2042,8 +1859,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 2, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2058,7 +1875,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -2077,7 +1893,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", @@ -2088,8 +1903,9 @@ data: "step": 600 } ], - "thresholds": "3,5", + "thresholds": "1, 3", "title": "Alerts Firing", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2102,7 +1918,6 @@ data: "valueName": "current" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2111,7 +1926,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": 
"${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2120,8 +1935,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 3, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2136,7 +1951,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -2155,7 +1969,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", @@ -2166,8 +1979,9 @@ data: "step": 600 } ], - "thresholds": "3,5", + "thresholds": "3, 5", "title": "Alerts Pending", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2180,7 +1994,6 @@ data: "valueName": "current" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2189,7 +2002,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2198,8 +2011,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 4, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2214,7 +2027,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -2233,7 +2045,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", @@ -2244,8 +2055,9 @@ data: "step": 600 } ], - "thresholds": "1,3", + "thresholds": "1, 3", "title": "Crashlooping Pods", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2258,19 +2070,16 @@ data: "valueName": "current" } ], - "repeat": null, - "repeatIteration": 
null, - "repeatRowId": null, "showTitle": false, - "title": "Dashboard Row", + "title": "Row", "titleSize": "h6" }, { "collapse": false, - "height": 250, + "editable": true, + "height": "250px", "panels": [ { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2279,7 +2088,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2288,8 +2097,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 5, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2304,7 +2113,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -2323,7 +2131,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", @@ -2334,8 +2141,9 @@ data: "step": 600 } ], - "thresholds": "1,3", + "thresholds": "1, 3", "title": "Node Not Ready", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2348,7 +2156,6 @@ data: "valueName": "current" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2357,7 +2164,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2366,8 +2173,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 6, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2382,7 +2189,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -2401,7 +2207,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": 
"sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", @@ -2412,8 +2217,9 @@ data: "step": 600 } ], - "thresholds": "1,3", + "thresholds": "1, 3", "title": "Node Disk Pressure", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2426,7 +2232,6 @@ data: "valueName": "current" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2435,7 +2240,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2444,8 +2249,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 7, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2460,7 +2265,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -2479,7 +2283,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", @@ -2490,8 +2293,9 @@ data: "step": 600 } ], - "thresholds": "1,3", + "thresholds": "1, 3", "title": "Node Memory Pressure", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2504,7 +2308,6 @@ data: "valueName": "current" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2513,7 +2316,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2522,8 +2325,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 8, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2538,7 +2341,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", 
"postfixFontSize": "50%", "prefix": "", @@ -2557,7 +2359,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(kube_node_spec_unschedulable)", @@ -2568,8 +2369,9 @@ data: "step": 600 } ], - "thresholds": "1,3", + "thresholds": "1, 3", "title": "Nodes Unschedulable", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -2582,15 +2384,13 @@ data: "valueName": "current" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "Dashboard Row", + "title": "Row", "titleSize": "h6" } ], "schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { @@ -2625,10 +2425,11 @@ data: "30d" ] }, - "timezone": "", + "timezone": "browser", "title": "Kubernetes Cluster Health", "version": 9 - } , + } + , "inputs": [ { "name": "DS_PROMETHEUS", @@ -2645,50 +2446,27 @@ data: { "__inputs": [ { - "name": "DS_PROMETHEUS", - "label": "prometheus", "description": "", - "type": "datasource", + "label": "prometheus", + "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { "list": [] }, "editable": true, - "gnetId": null, "graphTooltip": 0, "hideControls": false, - "id": null, "links": [], "rows": [ { "collapse": false, - "height": 129, + "height": "129px", "panels": [ { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2697,6 +2475,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2706,7 +2485,6 @@ 
data: "thresholdMarkers": true }, "id": 5, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2721,8 +2499,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -2740,18 +2516,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "1,3", + "thresholds": "1, 3", "title": "Control Plane UP", "type": "singlestat", "valueFontSize": "80%", @@ -2765,7 +2539,6 @@ data: "valueName": "total" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -2774,6 +2547,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -2783,7 +2557,6 @@ data: "thresholdMarkers": true }, "id": 6, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2798,8 +2571,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -2817,18 +2588,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "3,5", + "thresholds": "3, 5", "title": "Alerts Firing", "type": "singlestat", "valueFontSize": "80%", @@ -2842,19 +2611,15 @@ data: "valueName": "current" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": true, "title": "Cluster Health", "titleSize": "h6" }, { "collapse": false, - "height": 168, + "height": "168px", 
"panels": [ { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -2863,7 +2628,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -2873,7 +2638,6 @@ data: "thresholdMarkers": true }, "id": 1, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2888,8 +2652,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -2907,19 +2669,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", "format": "time_series", - "interval": "", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "50,80", + "thresholds": "50, 80", "title": "API Servers UP", "type": "singlestat", "valueFontSize": "80%", @@ -2933,7 +2692,6 @@ data: "valueName": "current" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -2942,7 +2700,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -2952,7 +2710,6 @@ data: "thresholdMarkers": true }, "id": 2, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -2967,8 +2724,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -2986,19 +2741,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", "format": "time_series", - "interval": "", "intervalFactor": 2, - "legendFormat": "", "refId": 
"A", "step": 600 } ], - "thresholds": "50,80", + "thresholds": "50, 80", "title": "Controller Managers UP", "type": "singlestat", "valueFontSize": "80%", @@ -3012,7 +2764,6 @@ data: "valueName": "current" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3021,7 +2772,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -3031,7 +2782,6 @@ data: "thresholdMarkers": true }, "id": 3, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3046,8 +2796,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -3065,19 +2813,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", "format": "time_series", - "interval": "", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "50,80", + "thresholds": "50, 80", "title": "Schedulers UP", "type": "singlestat", "valueFontSize": "80%", @@ -3091,7 +2836,6 @@ data: "valueName": "current" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ @@ -3100,7 +2844,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "none", "gauge": { "maxValue": 100, @@ -3109,9 +2853,7 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, - "hideTimeOverride": false, "id": 4, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3126,8 +2868,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -3145,19 +2885,16 @@ data: "lineColor": "rgb(31, 120, 193)", 
"show": false }, - "tableColumn": "", "targets": [ { "expr": "count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)", "format": "time_series", - "interval": "", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "1,3", + "thresholds": "1, 3", "title": "Crashlooping Control Plane Pods", "type": "singlestat", "valueFontSize": "80%", @@ -3171,19 +2908,15 @@ data: "valueName": "current" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": true, "title": "Control Plane Status", "titleSize": "h6" }, { "collapse": false, - "height": 158, + "height": "158px", "panels": [ { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3192,6 +2925,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -3201,7 +2935,6 @@ data: "thresholdMarkers": true }, "id": 8, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3216,8 +2949,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -3235,18 +2966,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "80,90", + "thresholds": "80, 90", "title": "CPU Utilization", "type": "singlestat", "valueFontSize": "80%", @@ -3260,7 +2989,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3269,6 +2997,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", + "editable": 
true, "format": "percent", "gauge": { "maxValue": 100, @@ -3278,7 +3007,6 @@ data: "thresholdMarkers": true }, "id": 7, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3293,8 +3021,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -3312,18 +3038,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "80,90", + "thresholds": "80, 90", "title": "Memory Utilization", "type": "singlestat", "valueFontSize": "80%", @@ -3337,7 +3061,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3346,6 +3069,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -3355,7 +3079,6 @@ data: "thresholdMarkers": true }, "id": 9, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3370,8 +3093,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -3389,18 +3110,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "80,90", + "thresholds": "80, 90", "title": "Filesystem Utilization", "type": "singlestat", "valueFontSize": "80%", @@ -3414,7 
+3133,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3423,6 +3141,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -3432,7 +3151,6 @@ data: "thresholdMarkers": true }, "id": 10, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3447,8 +3165,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, - "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -3466,18 +3182,16 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", "refId": "A", "step": 600 } ], - "thresholds": "80,90", + "thresholds": "80, 90", "title": "Pod Utilization", "type": "singlestat", "valueFontSize": "80%", @@ -3491,15 +3205,13 @@ data: "valueName": "avg" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": true, - "title": "Capacity Planing", + "title": "Capacity Planning", "titleSize": "h6" } ], "schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { @@ -3534,10 +3246,11 @@ data: "30d" ] }, - "timezone": "", + "timezone": "browser", "title": "Kubernetes Cluster Status", "version": 3 - } , + } + , "inputs": [ { "name": "DS_PROMETHEUS", @@ -3554,56 +3267,28 @@ data: { "__inputs": [ { - "name": "DS_PROMETHEUS", - "label": "prometheus", "description": "", - "type": "datasource", + "label": "prometheus", + "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", 
- "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { "list": [] }, "editable": true, - "gnetId": null, "graphTooltip": 0, "hideControls": false, - "id": null, "links": [], "rows": [ { "collapse": false, + "editable": true, "height": "250px", "panels": [ { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3612,6 +3297,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -3620,8 +3306,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 1, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3636,7 +3322,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -3655,7 +3340,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", @@ -3665,8 +3349,9 @@ data: "step": 600 } ], - "thresholds": "50,80", + "thresholds": "50, 80", "title": "API Servers UP", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -3679,7 +3364,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3688,6 +3372,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -3696,8 +3381,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 2, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3712,7 +3397,6 @@ data: ], 
"maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -3731,7 +3415,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", @@ -3741,8 +3424,9 @@ data: "step": 600 } ], - "thresholds": "50,80", - "title": "Controller Mangers UP", + "thresholds": "50, 80", + "title": "Controller Managers UP", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -3755,7 +3439,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3764,6 +3447,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -3772,8 +3456,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 3, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3788,7 +3472,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -3807,7 +3490,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", @@ -3817,8 +3499,9 @@ data: "step": 600 } ], - "thresholds": "50,80", + "thresholds": "50, 80", "title": "Schedulers UP", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -3831,7 +3514,6 @@ data: "valueName": "avg" }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -3840,6 +3522,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -3848,8 +3531,8 @@ data: "thresholdLabels": 
false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 4, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -3864,7 +3547,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -3883,7 +3565,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", @@ -3894,8 +3575,9 @@ data: "step": 600 } ], - "thresholds": "5,10", + "thresholds": "5, 10", "title": "API Server Request Error Rate", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -3908,16 +3590,14 @@ data: "valueName": "avg" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, - "height": 250, + "editable": true, + "height": "250px", "panels": [ { "aliasColors": {}, @@ -3925,16 +3605,26 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 7, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 1, @@ -3959,53 +3649,41 @@ data: "step": 30 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "API Server Request Latency", "tooltip": { + "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { 
"format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, - "height": 250, + "editable": true, + "height": "250px", "panels": [ { "aliasColors": {}, @@ -4013,16 +3691,26 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 5, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 1, @@ -4046,38 +3734,28 @@ data: "step": 60 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "End to end scheduling latency", + "title": "End to End Scheduling Latency", "tooltip": { + "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "dtdurations", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] @@ -4088,16 +3766,26 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 6, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + 
"hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 1, @@ -4130,52 +3818,40 @@ data: "step": 60 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "API Server Request Rates", "tooltip": { + "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" } ], "schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { @@ -4210,10 +3886,11 @@ data: "30d" ] }, - "timezone": "", + "timezone": "browser", "title": "Kubernetes Control Plane Status", "version": 3 - } , + } + , "inputs": [ { "name": "DS_PROMETHEUS", @@ -4230,54 +3907,27 @@ data: { "__inputs": [ { - "name": "DS_PROMETHEUS", - "label": "prometheus", "description": "", - "type": "datasource", + "label": "prometheus", + "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { "list": [] }, - "description": "", "editable": true, - "gnetId": null, "graphTooltip": 0, "hideControls": false, - "id": null, "links": 
[], + "refresh": false, "rows": [ { "collapse": false, - "height": "300", + "editable": true, + "height": "300px", "panels": [ { "aliasColors": {}, @@ -4286,16 +3936,26 @@ data: "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "editable": true, + "error": false, "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 1, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 1, @@ -4321,26 +3981,23 @@ data: }, { "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "hide": false, "intervalFactor": 2, "legendFormat": "Requested CPU Cores", "refId": "B", "step": 20 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "CPU Cores", "tooltip": { + "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -4349,22 +4006,16 @@ data: "format": "short", "label": "CPU Cores", "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -4373,7 +4024,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, 
"format": "percent", "gauge": { "maxValue": 100, @@ -4382,8 +4033,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 2, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -4398,7 +4049,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -4417,7 +4067,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": true }, - "tableColumn": "", "targets": [ { "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", @@ -4429,6 +4078,7 @@ data: ], "thresholds": "80, 90", "title": "CPU Cores", + "transparent": false, "type": "singlestat", "valueFontSize": "110%", "valueMaps": [ @@ -4441,16 +4091,14 @@ data: "valueName": "avg" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, "title": "CPU Cores", "titleSize": "h6" }, { "collapse": false, - "height": "300", + "editable": true, + "height": "300px", "panels": [ { "aliasColors": {}, @@ -4459,16 +4107,26 @@ data: "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "editable": true, + "error": false, "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 3, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": 
true, "linewidth": 1, @@ -4494,26 +4152,23 @@ data: }, { "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "hide": false, "intervalFactor": 2, "legendFormat": "Requested Memory", "refId": "B", "step": 20 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "Memory", "tooltip": { + "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -4522,22 +4177,16 @@ data: "format": "bytes", "label": "Memory", "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -4546,7 +4195,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "decimals": null, + "editable": true, "format": "percent", "gauge": { "maxValue": 100, @@ -4555,8 +4204,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 4, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -4571,7 +4220,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -4590,7 +4238,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": true }, - "tableColumn": "", "targets": [ { "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", @@ -4602,6 +4249,7 @@ data: ], "thresholds": "80, 90", "title": "Memory", + "transparent": false, "type": "singlestat", "valueFontSize": "110%", "valueMaps": [ @@ -4614,15 +4262,13 @@ data: "valueName": "avg" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, "title": "Memory", "titleSize": "h6" } ], 
"schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { @@ -4660,7 +4306,8 @@ data: "timezone": "browser", "title": "Kubernetes Resource Requests", "version": 2 - } , + } + , "inputs": [ { "name": "DS_PROMETHEUS", @@ -4677,38 +4324,12 @@ data: { "__inputs": [ { - "name": "DS_PROMETHEUS", - "label": "prometheus", "description": "", - "type": "datasource", + "label": "prometheus", + "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { @@ -4719,16 +4340,15 @@ data: "gnetId": 22, "graphTooltip": 0, "hideControls": false, - "id": null, "links": [], "refresh": false, "rows": [ { "collapse": false, + "editable": true, "height": "250px", "panels": [ { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -4737,16 +4357,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 3, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -4771,10 +4398,7 @@ data: "step": 50 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Idle cpu", + "title": "Idle CPU", "tooltip": { "msResolution": false, "shared": true, @@ -4783,9 +4407,7 @@ data: }, "type": "graph", "xaxis": 
{ - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -4800,16 +4422,12 @@ data: }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -4818,16 +4436,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 9, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -4868,10 +4493,7 @@ data: "target": "" } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System load", + "title": "System Load", "tooltip": { "msResolution": false, "shared": true, @@ -4880,45 +4502,34 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" }, { "collapse": false, + "editable": true, "height": "250px", "panels": [ { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -4927,8 +4538,12 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 4, + "isNew": false, "legend": { "alignAsTable": false, "avg": false, @@ -4939,8 +4554,7 @@ data: "min": false, "rightSide": false, 
"show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -4997,10 +4611,7 @@ data: "step": 10 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory usage", + "title": "Memory Usage", "tooltip": { "msResolution": false, "shared": true, @@ -5009,33 +4620,25 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -5045,7 +4648,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "percent", "gauge": { "maxValue": 100, @@ -5054,8 +4656,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 5, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -5070,7 +4672,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -5089,7 +4690,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", @@ -5100,7 +4700,8 @@ data: } ], "thresholds": "80, 90", - "title": "Memory usage", + "title": "Memory Usage", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -5113,19 +4714,16 @@ data: "valueName": "avg" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" 
}, { "collapse": false, + "editable": true, "height": "250px", "panels": [ { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -5134,16 +4732,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 6, + "isNew": true, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -5196,9 +4801,6 @@ data: "step": 20 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "Disk I/O", "tooltip": { "msResolution": false, @@ -5208,33 +4810,24 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "ms", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ @@ -5244,7 +4837,6 @@ data: ], "datasource": "${DS_PROMETHEUS}", "editable": true, - "error": false, "format": "percentunit", "gauge": { "maxValue": 1, @@ -5253,8 +4845,8 @@ data: "thresholdLabels": false, "thresholdMarkers": true }, + "hideTimeOverride": false, "id": 7, - "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ @@ -5269,7 +4861,6 @@ data: ], "maxDataPoints": 100, "nullPointMode": "connected", - "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", @@ -5288,7 +4879,6 @@ data: "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "", "targets": [ { "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / 
sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", @@ -5299,7 +4889,8 @@ data: } ], "thresholds": "0.75, 0.9", - "title": "Disk space usage", + "title": "Disk Space Usage", + "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -5312,19 +4903,16 @@ data: "valueName": "current" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" }, { "collapse": false, + "editable": true, "height": "250px", "panels": [ { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -5333,16 +4921,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 8, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -5354,7 +4949,7 @@ data: "renderer": "flot", "seriesOverrides": [ { - "alias": "transmitted ", + "alias": "transmitted", "yaxis": 2 } ], @@ -5373,10 +4968,7 @@ data: "target": "" } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network received", + "title": "Network Received", "tooltip": { "msResolution": false, "shared": true, @@ -5385,33 +4977,24 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] }, { - "alerting": {}, "aliasColors": {}, "bars": false, "dashLength": 10, @@ -5420,16 +5003,23 @@ data: "editable": true, "error": false, "fill": 1, - "grid": 
{}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 10, + "isNew": false, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, + "rightSide": false, "show": true, - "total": false, - "values": false + "total": false }, "lines": true, "linewidth": 2, @@ -5441,7 +5031,7 @@ data: "renderer": "flot", "seriesOverrides": [ { - "alias": "transmitted ", + "alias": "transmitted", "yaxis": 2 } ], @@ -5460,10 +5050,7 @@ data: "target": "" } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network transmitted", + "title": "Network Transmitted", "tooltip": { "msResolution": false, "shared": true, @@ -5472,41 +5059,31 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" } ], "schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { @@ -5565,7 +5142,8 @@ data: "timezone": "browser", "title": "Nodes", "version": 2 - } , + } + , "inputs": [ { "name": "DS_PROMETHEUS", @@ -5582,46 +5160,26 @@ data: { "__inputs": [ { - "name": "DS_PROMETHEUS", - "label": "prometheus", "description": "", - "type": "datasource", + "label": "prometheus", + "name": "DS_PROMETHEUS", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.1" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - 
"id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { "list": [] }, "editable": true, - "gnetId": null, "graphTooltip": 1, "hideControls": false, - "id": null, "links": [], + "refresh": false, "rows": [ { "collapse": false, + "editable": true, "height": "250px", "panels": [ { @@ -5633,12 +5191,18 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 1, + "isNew": false, "legend": { "alignAsTable": true, "avg": true, "current": true, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, "rightSide": true, @@ -5679,9 +5243,6 @@ data: "step": 20 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "Memory Usage", "tooltip": { "msResolution": true, @@ -5691,41 +5252,31 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, "title": "Row", "titleSize": "h6" }, { "collapse": false, + "editable": true, "height": "250px", "panels": [ { @@ -5737,12 +5288,18 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 2, + "isNew": false, "legend": { "alignAsTable": true, "avg": true, "current": true, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, "rightSide": true, @@ -5765,16 +5322,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (container_name)( 
rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", + "expr": "sum by (container_name)(rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", "intervalFactor": 2, "legendFormat": "{{ container_name }}", "refId": "A", "step": 30 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "CPU Usage", "tooltip": { "msResolution": true, @@ -5784,41 +5338,31 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "Row", "titleSize": "h6" }, { "collapse": false, + "editable": true, "height": "250px", "panels": [ { @@ -5830,12 +5374,18 @@ data: "editable": true, "error": false, "fill": 1, - "grid": {}, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "id": 3, + "isNew": false, "legend": { "alignAsTable": true, "avg": true, "current": true, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, "rightSide": true, @@ -5858,16 +5408,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", "refId": "A", "step": 30 } ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, "title": "Network I/O", "tooltip": { "msResolution": true, @@ -5877,41 +5424,31 @@ data: }, "type": "graph", "xaxis": { - "buckets": null, "mode": 
"time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ] } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, "showTitle": false, - "title": "New row", + "title": "New Row", "titleSize": "h6" } ], "schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { @@ -6010,7 +5547,8 @@ data: "timezone": "browser", "title": "Pods", "version": 1 - } , + } + , "inputs": [ { "name": "DS_PROMETHEUS", @@ -6029,4 +5567,3 @@ data: "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090" } ---- diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..fdc2b200 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/aknuds1/grafanalib.git@v0.4.0 \ No newline at end of file From e722d911bf92f1f476a8fb8cc2e565ad5eed68d9 Mon Sep 17 00:00:00 2001 From: Daniel Fenton Date: Tue, 31 Oct 2017 14:44:07 -0400 Subject: [PATCH 139/638] Update KOPSonAWS.md The AWS CLI SG commands are not necessary, kops creates an SG with these ports open to the correct SG by default --- docs/KOPSonAWS.md | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/docs/KOPSonAWS.md b/docs/KOPSonAWS.md index 0269f161..67213934 100644 --- a/docs/KOPSonAWS.md +++ b/docs/KOPSonAWS.md @@ -5,22 +5,8 @@ A running Kubernetes cluster created with [KOPS](https://github.com/kubernetes/kops). -These instructions have currently been tested with **topology=public** on AWS with KOPS 1.5.1 and Kubernetes 1.5.x +These instructions have currently been tested with **topology=public** on AWS with KOPS 1.7.1 and Kubernetes 1.7.x -## Open AWS Security Groups: -1. Open port 9100 on the masters security group to the nodes security group -1. 
Open ports 10250-10252 on the masters security group to the nodes security group. - -Example script below requires $AWS\_DEFAULT_PROFILE and [$NAME](https://github.com/kubernetes/kops/blob/master/docs/aws.md#prepare-local-environment) - -```bash -MASTER_SG=$(aws --profile ${AWS_DEFAULT_PROFILE} ec2 describe-security-groups --filters "Name=tag:Name,Values=masters.$NAME" --query "SecurityGroups[*].GroupId[]" --output=text) -NODES_SG=$(aws --profile ${AWS_DEFAULT_PROFILE} ec2 describe-security-groups --filters "Name=tag:Name,Values=nodes.$NAME" --query "SecurityGroups[*].GroupId[]" --output=text) -aws --profile ${AWS_DEFAULT_PROFILE} ec2 authorize-security-group-ingress --group-id $MASTER_SG --protocol tcp --port 9100 --source-group $NODES_SG -aws --profile ${AWS_DEFAULT_PROFILE} ec2 authorize-security-group-ingress --group-id $MASTER_SG --protocol tcp --port 10250-10252 --source-group $NODES_SG -``` - -## Adding kube-prometheus Following the instructions in the [README](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/README.md): Example: From 3efaab694f6aabdf22203fa2eb58b83ca697cd16 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 1 Nov 2017 11:51:13 +0100 Subject: [PATCH 140/638] *: cut v0.14.1 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index dc2a74cb..2ba8dab8 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.14.0 + image: quay.io/coreos/prometheus-operator:v0.14.1 name: prometheus-operator ports: - containerPort: 8080 From 85ddb3137ca3dee0d8a6f81eee71ab3f1b7c73fa Mon Sep 
17 00:00:00 2001 From: Konstantinos Natsakis Date: Tue, 7 Nov 2017 16:44:05 +0200 Subject: [PATCH 141/638] kube-prometheus: add stateful sets dashboard --- assets/grafana/statefulset.dashboard.py | 440 ++++++++++++++++++++++++ 1 file changed, 440 insertions(+) create mode 100644 assets/grafana/statefulset.dashboard.py diff --git a/assets/grafana/statefulset.dashboard.py b/assets/grafana/statefulset.dashboard.py new file mode 100644 index 00000000..fb01ce17 --- /dev/null +++ b/assets/grafana/statefulset.dashboard.py @@ -0,0 +1,440 @@ +import sys +import os.path +sys.path.insert(0, os.path.dirname(__file__)) +from _grafanalib import * + + +dashboard = Dashboard( + title='Stateful Set', + version=1, + graphTooltip=1, + time=Time(start='now-6h'), + templating=Templating(list=[ + { + 'allValue': '.*', + 'current': {}, + 'datasource': '${DS_PROMETHEUS}', + 'hide': 0, + 'includeAll': False, + 'label': 'Namespace', + 'multi': False, + 'name': 'statefulset_namespace', + 'options': [], + 'query': 'label_values(kube_statefulset_metadata_generation, ' + 'namespace)', + 'refresh': 1, + 'regex': '', + 'sort': 0, + 'tagValuesQuery': None, + 'tags': [], + 'tagsQuery': '', + 'type': 'query', + 'useTags': False, + }, + { + 'allValue': None, + 'current': {}, + 'datasource': '${DS_PROMETHEUS}', + 'hide': 0, + 'includeAll': False, + 'label': 'Stateful Set', + 'multi': False, + 'name': 'statefulset_name', + 'options': [], + 'query': 'label_values(kube_statefulset_metadata_generation' + '{namespace="$statefulset_namespace"}, statefulset)', + 'refresh': 1, + 'regex': '', + 'sort': 0, + 'tagValuesQuery': '', + 'tags': [], + 'tagsQuery': 'statefulset', + 'type': 'query', + 'useTags': False, + }, + ]), + rows=[ + Row(panels=[ + SingleStat( + title='CPU', + id=8, + gauge=Gauge(show=False), + postfix='cores', + span=4, + valueFontSize='110%', + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + valueMaps=[ 
+ { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + sparkline=SparkLine( + fillColor=(31, 118, 189, 0.18), + lineColor=(31, 120, 193), + show=True, + ), + targets=[ + { + 'expr': 'sum(rate(container_cpu_usage_seconds_total' + '{namespace=\"$statefulset_namespace\",pod_name=~\"' + '$statefulset_name.*\"}[3m]))', + }, + ], + ), + SingleStat( + title='Memory', + id=9, + postfix='GB', + prefixFontSize='80%', + gauge=Gauge(show=False), + span=4, + valueFontSize='110%', + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + sparkline=SparkLine( + fillColor=(31, 118, 189, 0.18), + lineColor=(31, 120, 193), + show=True, + ), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'expr': 'sum(container_memory_usage_bytes{namespace=' + '\"$statefulset_namespace\",pod_name=~\"$' + 'statefulset_name.*\"}) / 1024^3', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 600, + }, + ], + ), + SingleStat( + title='Network', + format='Bps', + gauge=Gauge(thresholdMarkers=False), + id=7, + postfix='', + span=4, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + sparkline=SparkLine( + fillColor=(31, 118, 189, 0.18), + lineColor=(31, 120, 193), + show=True, + ), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'expr': 
'sum(rate(container_network_transmit_' + 'bytes_total' + '{namespace=\"$statefulset_namespace\",pod_name=~\"' + '$statefulset_name.*\"}[3m])) + ' + 'sum(rate(container_network_receive_bytes_total' + '{namespace=\"$statefulset_namespace\",pod_name=~' + '\"$statefulset_name.*\"}[3m]))', + }, + ], + ), + ], + height=200, + ), + Row( + height=100, panels=[ + SingleStat( + title='Desired Replicas', + id=5, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + span=3, + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + targets=[ + { + 'metric': 'kube_statefulset_replicas', + 'expr': 'max(kube_statefulset_replicas' + '{statefulset="$statefulset_name",namespace=' + '"$statefulset_namespace"}) without ' + '(instance, pod)', + }, + ], + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + gauge=Gauge(thresholdMarkers=False, show=False), + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + ), + SingleStat( + title='Available Replicas', + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + gauge=Gauge(show=False), + id=6, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': 'min(kube_statefulset_status_replicas' + '{statefulset=\"$statefulset_name\",' + 'namespace=\"$statefulset_namespace\"}) without ' + '(instance, pod)', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + span=3, + sparkline=SparkLine(), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + } + ], + ), + SingleStat( + title='Observed Generation', + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + gauge=Gauge(), + id=3, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 
'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': 'max(kube_statefulset_status_observed_' + 'generation{statefulset=\"$statefulset_name\",' + 'namespace=\"$statefulset_namespace\"}) without ' + '(instance, pod)', + }, + ], + rangeMaps=[ + { + 'from': "null", + 'text': 'N/A', + 'to': 'null', + }, + ], + span=3, + sparkline=SparkLine(), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + } + ], + ), + SingleStat( + title='Metadata Generation', + colors=[ + (245, 54, 54, 0.9), + (237, 129, 40, 0.89), + (50, 172, 45, 0.97), + ], + gauge=Gauge(show=False), + id=2, + mappingType=1, + mappingTypes=[ + { + 'name': 'value to text', + 'value': 1, + }, + { + 'name': 'range to text', + 'value': 2, + }, + ], + targets=[ + { + 'expr': 'max(kube_statefulset_metadata_generation' + '{statefulset=\"$statefulset_name\",namespace=\"' + '$statefulset_namespace\"}) without (instance, ' + 'pod)', + }, + ], + rangeMaps=[ + { + 'from': 'null', + 'text': 'N/A', + 'to': 'null', + }, + ], + span=3, + sparkline=SparkLine(), + valueMaps=[ + { + 'op': '=', + 'text': 'N/A', + 'value': 'null', + }, + ], + ), + ], + ), + Row( + height=350, panels=[ + Graph( + title='Replicas', + dashLength=10, + dashes=False, + id=1, + spaceLength=10, + targets=[ + { + 'expr': 'min(kube_statefulset_status_replicas' + '{statefulset=\"$statefulset_name\",' + 'namespace=\"$statefulset_namespace\"}) without ' + '(instance, pod)', + 'legendFormat': 'available', + 'refId': 'B', + 'step': 30, + }, + { + 'expr': 'max(kube_statefulset_replicas' + '{statefulset=\"$statefulset_name\",namespace=\"' + '$statefulset_namespace\"}) without ' + '(instance, pod)', + 'legendFormat': 'desired', + 'refId': 'E', + 'step': 30, + } + ], + xAxis=XAxis(mode='time'), + yAxes=YAxes( + YAxis(min=None), + YAxis(format='short', min=None, show=False), + ), + ), + ] + ), + ], +) From 6b5681a0c198f3aff8e8d8d404f053edf732f134 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 8 Nov 2017 16:34:18 
+0100 Subject: [PATCH 142/638] *: upgrade to Prometheus 2.0 --- manifests/prometheus/prometheus-k8s.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 168daa34..08a71023 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v2.0.0-rc.1 + version: v2.0.0 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: From b071ae02caebbcef9b3887b1e77ecdd52ff12898 Mon Sep 17 00:00:00 2001 From: Harry Lee Date: Thu, 9 Nov 2017 09:40:52 +0200 Subject: [PATCH 143/638] Update KOPSonAWS.md --- docs/KOPSonAWS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/KOPSonAWS.md b/docs/KOPSonAWS.md index 67213934..26080a9f 100644 --- a/docs/KOPSonAWS.md +++ b/docs/KOPSonAWS.md @@ -14,8 +14,7 @@ Example: ```bash git clone -b master https://github.com/coreos/prometheus-operator.git prometheus-operator-temp; cd prometheus-operator-temp/contrib/kube-prometheus -./hack/cluster-monitoring/deploy -kubectl -n kube-system create -f manifests/k8s/self-hosted/ +./hack/cluster-monitoring/self-hosted-deploy cd - rm -rf prometheus-operator-temp ``` From d80eaea23a67df452183284e9981b58c7cf6cea1 Mon Sep 17 00:00:00 2001 From: Konstantinos Natsakis Date: Thu, 9 Nov 2017 18:33:25 +0200 Subject: [PATCH 144/638] kube-prometheus: use StatefulSet for dashboard title --- assets/grafana/statefulset.dashboard.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/grafana/statefulset.dashboard.py b/assets/grafana/statefulset.dashboard.py index fb01ce17..a28d70fd 100644 --- a/assets/grafana/statefulset.dashboard.py +++ b/assets/grafana/statefulset.dashboard.py @@ -5,7 +5,7 @@ from _grafanalib import * dashboard = Dashboard( - title='Stateful Set', + title='StatefulSet', version=1, graphTooltip=1, 
time=Time(start='now-6h'), @@ -37,7 +37,7 @@ dashboard = Dashboard( 'datasource': '${DS_PROMETHEUS}', 'hide': 0, 'includeAll': False, - 'label': 'Stateful Set', + 'label': 'StatefulSet', 'multi': False, 'name': 'statefulset_name', 'options': [], From 5febe75910551dc75f6edefda9d1a6526e5a5920 Mon Sep 17 00:00:00 2001 From: Konstantinos Natsakis Date: Thu, 9 Nov 2017 18:37:46 +0200 Subject: [PATCH 145/638] kube-prometheus: add StatefulSet dashboard to ConfigMap --- manifests/grafana/grafana-dashboards.yaml | 720 ++++++++++++++++++++++ 1 file changed, 720 insertions(+) diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index e76a1103..aff751da 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -5559,6 +5559,726 @@ data: ], "overwrite": true } + statefulset-dashboard.json: |+ + { + "dashboard": + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "graphTooltip": 1, + "hideControls": false, + "links": [], + "rows": [ + { + "collapse": false, + "height": "200px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", 
+ "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + 
"rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "100px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + 
"nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "metric": "kube_statefulset_replicas", + "refId": "A", + "step": 600 + } + ], + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": 
"=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_status_observed_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", 
+ "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_metadata_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without 
(instance, pod)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "show": false + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "statefulset_namespace", + "options": [], + "query": "label_values(kube_statefulset_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "StatefulSet", + "multi": false, + "name": "statefulset_name", + "options": [], + "query": "label_values(kube_statefulset_metadata_generation{namespace=\"$statefulset_namespace\"}, statefulset)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "statefulset", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "StatefulSet", + "version": 1 + } + , + "inputs": [ + { + "name": 
"DS_PROMETHEUS", + "pluginId": "prometheus", + "type": "datasource", + "value": "prometheus" + } + ], + "overwrite": true + } prometheus-datasource.json: |+ { "access": "proxy", From 245029da506106544b7f5158ff827fb0093571f4 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Fri, 10 Nov 2017 12:03:47 +0100 Subject: [PATCH 146/638] kube-prometheus: Improve README, recommending Tectonic Installer --- README.md | 50 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index ac102bb8..a47d8f41 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,40 @@ # kube-prometheus -This repository collects Kubernetes manifests, dashboards, and alerting rules -combined with documentation and scripts to provide single-command deployments -of end-to-end Kubernetes cluster monitoring. +This repository collects Kubernetes manifests, [Grafana](http://grafana.com/) dashboards, and +[Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) +combined with documentation and scripts to provide single-command deployments of end-to-end +Kubernetes cluster monitoring with [Prometheus](https://prometheus.io/) (Operator). ## Prerequisites -First, you need a running Kubernetes cluster. If you don't have one, follow the -instructions of [bootkube](https://github.com/kubernetes-incubator/bootkube) or -[minikube](https://github.com/kubernetes/minikube). Some sample contents of this +First, you need a running Kubernetes cluster. If you don't have one, we recommend you create one +with [Tectonic Installer](https://coreos.com/tectonic/docs/latest/). Despite the name, +Tectonic Installer gives you also the choice to create a barebones Kubernetes cluster, without +CoreOS' Tectonic technology. Otherwise, you can simply make use of +[bootkube](https://github.com/kubernetes-incubator/bootkube) or +[minikube](https://github.com/kubernetes/minikube) for local testing. 
Some sample contents of this repository are adapted to work with a [multi-node setup](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node) using [bootkube](https://github.com/kubernetes-incubator/bootkube). ## Monitoring Kubernetes -The manifests used here use the [Prometheus Operator](https://github.com/coreos/prometheus-operator), -which manages Prometheus servers and their configuration in a cluster. With a single command we can install +The manifests here use the [Prometheus Operator](https://github.com/coreos/prometheus-operator), +which manages Prometheus servers and their configuration in a cluster. With a single command we can +install * The Operator itself * The Prometheus [node_exporter](https://github.com/prometheus/node_exporter) * [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) * The [Prometheus specification](https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheus) based on which the Operator deploys a Prometheus setup * A Prometheus configuration covering monitoring of all Kubernetes core components and exporters -* A default set of alerting rules on the cluster component's health +* A default set of alerting rules on the cluster components' health * A Grafana instance serving dashboards on cluster metrics * A three node highly available Alertmanager cluster Simply run: ```bash -export KUBECONFIG= # defaults to "~/.kube/config" +export KUBECONFIG= # defaults to "~/.kube/config" hack/cluster-monitoring/deploy ``` @@ -47,11 +52,11 @@ hack/cluster-monitoring/teardown ## Monitoring custom services -The example manifests in [/manifests/examples/example-app](/contrib/kube-prometheus/manifests/examples/example-app) +The example manifests in [manifests/examples/example-app](/contrib/kube-prometheus/manifests/examples/example-app) deploy a fake service exposing Prometheus metrics. 
They additionally define a new Prometheus server and a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/master/Documentation/design.md#servicemonitor), which specifies how the example service should be monitored. -The Prometheus Operator will deploy and configure the desired Prometheus instance and continiously +The Prometheus Operator will deploy and configure the desired Prometheus instance and continuously manage its life cycle. ```bash @@ -71,21 +76,32 @@ hack/example-service-monitoring/teardown ## Dashboarding -The provided manifests deploy a Grafana instance serving dashboards provided via a ConfigMap. -To modify, delete, or add dashboards, the `grafana-dashboards` ConfigMap must be modified. +The provided manifests deploy a Grafana instance serving dashboards provided via ConfigMaps. +Said ConfigMaps are generated from Python scripts in assets/grafana, that all have the extension +.dashboard.py as they are loaded by the [grafanalib](https://github.com/aknuds1/grafanalib) +Grafana dashboard generator. Bear in mind that we are for now using a fork of grafanalib as +we needed to make extensive changes to it, in order to be able to generate our dashboards. We are +hoping to be able to consolidate our version with the original. + +As such, in order to make changes to the dashboard bundle, you need to change the \*.dashboard.py +files in assets/grafana, eventually add your own, and then run `make generate` in the +kube-prometheus root directory. + +To read more in depth about developing dashboards, read the [Developing alerts and dashboards](docs/developing-alerts-and-dashboards.md) documentation. + +### Reloading of dashboards Currently, Grafana does not support serving dashboards from static files. Instead, the `grafana-watcher` sidecar container aims to emulate the behavior, by keeping the Grafana database always in sync with the provided ConfigMap. Hence, the Grafana pod is effectively stateless. 
This allows managing dashboards via `git` etc. and easily deploying them via CD pipelines. -For information about how to update/handle the dashboards check [Developing alerts and dashboards](docs/developing-alerts-and-dashboards.md) doc. - In the future, a separate Grafana operator will support gathering dashboards from multiple ConfigMaps based on label selection. WARNING: If you deploy multiple Grafana instances for HA, you must use session affinity. -Otherwise if pods restart the prometheus datasource ID can get out of sync between the pods, breaking the UI +Otherwise if pods restart the prometheus datasource ID can get out of sync between the pods, +breaking the UI ## Roadmap From 7d693b543363f4e820241f84a952e99e9965a8ea Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Fri, 10 Nov 2017 12:03:55 +0100 Subject: [PATCH 147/638] kube-prometheus: Update development docs --- README.md | 4 ++- docs/developing-alerts-and-dashboards.md | 32 ++++++++++++++---------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index a47d8f41..c8236bf5 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,9 @@ As such, in order to make changes to the dashboard bundle, you need to change th files in assets/grafana, eventually add your own, and then run `make generate` in the kube-prometheus root directory. -To read more in depth about developing dashboards, read the [Developing alerts and dashboards](docs/developing-alerts-and-dashboards.md) documentation. +To read more in depth about developing dashboards, read the +[Developing Prometheus Rules and Grafana Dashboards](docs/developing-alerts-and-dashboards.md) +documentation. 
### Reloading of dashboards diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md index bfba9f0d..ed3a2a06 100644 --- a/docs/developing-alerts-and-dashboards.md +++ b/docs/developing-alerts-and-dashboards.md @@ -1,35 +1,41 @@ -# Developing Alerts and Dashboards +# Developing Prometheus Rules and Grafana Dashboards -`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point one might like to extend them. This document is intended to explain the workflow of how additional alerting rules and dashboards could be added. +`kube-prometheus` ships with a set of default [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) and [Grafana](http://grafana.com/) dashboards. At some point one might like to extend them, the purpose of this document is to explain how to do this. -For both, the Prometheus alerting rules as well as the Grafana dashboards, there are Kubernetes `ConfigMap`s, that are generated from content in the `assets/` directory. +For both the Prometheus rules and the Grafana dashboards there are Kubernetes `ConfigMap`s, that are generated from content in the `assets/` directory. The source of truth for the alerts and dashboards are the files in the `assets/` directory. The respective files have to be changed there and then the `make generate` make target is executed to re-generate the Kubernetes manifests. Note: `make generate` should be executed from kube-prometheus base directory. -## Alerts +## Prometheus Rules -The `ConfigMap` that is generated and holds the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`. +The `ConfigMap` that is generated and holds the Prometheus rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`. -It is generated by taking all the `*.rules` files in the `assets/prometheus/rules/` directory and generate the `ConfigMap`. 
+It is generated from all the `*.rules.yaml` files in the `assets/prometheus/rules/` directory. -To extend the alerting rules simply add a new `.rules` file into the `assets/prometheus/rules/` directory and re-generate the manifests. To modify the existing rules, simply edit the respective `.rules` file and re-generate the manifest. +To extend the rules simply add a new `.rules.yaml` file into the `assets/prometheus/rules/` directory and re-generate the manifests. To modify the existing rules, simply edit the respective `.rules.yaml` file and re-generate the manifest. Then the generated manifest can be applied against a Kubernetes cluster. ## Dashboards -The `ConfigMap` that is generated and holds the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`. +The generated `ConfigMap`s holding the Grafana dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`. -As Grafana's support for applying dashboards from files is limited a sidecar (called "grafana-watcher") was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database is in sync with the dashboard definitions. +The dashboards themselves get generated from Python scripts: assets/grafana/\*.dashboard.py. +These scripts are loaded by the [grafanalib](https://github.com/aknuds1/grafanalib) +Grafana dashboard generator, which turns them into dashboards. -To edit/create a dashboard login to Grafana and modify and save the dashboard. Then download the dashboard definition in Grafana through `Share` -> `Export` -> `Save to file`. Move the file to `assets/grafana/` and re-generate the manifests (executing `make generate` from kube-prometheus base directory). +Bear in mind that we are for now using a fork of grafanalib as we needed to make extensive +changes to it, in order to be able to generate our dashboards. We are hoping to be able to +consolidate our version with the original. 
-Note: The dashboard json file to be copied in `assets/grafana/` should be suffixed with `-dashboard.json`, otherwise it won't be processed by `make generate`. +After changing grafanalib scripts in assets/grafana, or adding your own, you'll have to run +`make generate` in the kube-prometheus root directory in order to re-generate the dashboards +manifest. You can deploy the latter with kubectl similar to the following: -Then the generated manifest can be applied against a Kubernetes cluster with something like: ``` kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml ``` -That will update the ConfigMap `grafana-dashboards`. Change should be automatically detected by grafana-watcher and dashboards reloaded. + +This should cause Grafana to re-load its dashboards automatically. From b65ed3ba6018633faae7d1bc2ea502e9cfbcdc99 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 10 Nov 2017 10:33:32 +0100 Subject: [PATCH 148/638] kube-prometheus: add custom-metrics-api example --- manifests/custom-metrics-api/.gitignore | 7 ++++ manifests/custom-metrics-api/README.md | 11 +++++ ...r-auth-delegator-cluster-role-binding.yaml | 12 ++++++ ...cs-apiserver-auth-reader-role-binding.yaml | 13 ++++++ .../custom-metrics-apiserver-deployment.yaml | 41 +++++++++++++++++++ ...-resource-reader-cluster-role-binding.yaml | 12 ++++++ ...tom-metrics-apiserver-service-account.yaml | 4 ++ .../custom-metrics-apiserver-service.yaml | 10 +++++ .../custom-metrics-apiservice.yaml | 13 ++++++ .../custom-metrics-cluster-role.yaml | 9 ++++ ...-metrics-resource-reader-cluster-role.yaml | 14 +++++++ manifests/custom-metrics-api/deploy.sh | 13 ++++++ manifests/custom-metrics-api/gencerts.sh | 21 ++++++++++ ...a-custom-metrics-cluster-role-binding.yaml | 12 ++++++ manifests/custom-metrics-api/teardown.sh | 13 ++++++ 15 files changed, 205 insertions(+) create mode 100644 manifests/custom-metrics-api/.gitignore create mode 100644 manifests/custom-metrics-api/README.md create 
mode 100644 manifests/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml create mode 100644 manifests/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml create mode 100644 manifests/custom-metrics-api/custom-metrics-apiserver-deployment.yaml create mode 100644 manifests/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml create mode 100644 manifests/custom-metrics-api/custom-metrics-apiserver-service-account.yaml create mode 100644 manifests/custom-metrics-api/custom-metrics-apiserver-service.yaml create mode 100644 manifests/custom-metrics-api/custom-metrics-apiservice.yaml create mode 100644 manifests/custom-metrics-api/custom-metrics-cluster-role.yaml create mode 100644 manifests/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml create mode 100755 manifests/custom-metrics-api/deploy.sh create mode 100755 manifests/custom-metrics-api/gencerts.sh create mode 100644 manifests/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml create mode 100755 manifests/custom-metrics-api/teardown.sh diff --git a/manifests/custom-metrics-api/.gitignore b/manifests/custom-metrics-api/.gitignore new file mode 100644 index 00000000..794c008c --- /dev/null +++ b/manifests/custom-metrics-api/.gitignore @@ -0,0 +1,7 @@ +apiserver-key.pem +apiserver.csr +apiserver.pem +metrics-ca-config.json +metrics-ca.crt +metrics-ca.key +cm-adapter-serving-certs.yaml diff --git a/manifests/custom-metrics-api/README.md b/manifests/custom-metrics-api/README.md new file mode 100644 index 00000000..91375a42 --- /dev/null +++ b/manifests/custom-metrics-api/README.md @@ -0,0 +1,11 @@ +# Custom Metrics API + +The custom metrics API allows the HPA v2 to scale on arbitrary metrics. + +This directory contains an example deployment of the custom metrics API adapter using Prometheus as the backing monitoring system.
+ +In order to deploy the custom metrics adapter for Prometheus you need to generate TLS certificates used to serve the API. An example of how these could be generated can be found in `./gencerts.sh`, note that this is _not_ recommended to be used in production. You need to employ a secure PKI strategy, this is merely an example to get started and try it out quickly. + +Once the generated `Secret` with the certificates is in place, you can deploy everything in the `monitoring` namespace using `./deploy.sh`. + +When you're done, you can teardown using the `./teardown.sh` script. diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml b/manifests/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml new file mode 100644 index 00000000..8853bc1f --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: custom-metrics:system:auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: custom-metrics-apiserver + namespace: monitoring diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml b/manifests/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml new file mode 100644 index 00000000..682143cf --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: custom-metrics-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: custom-metrics-apiserver + namespace: monitoring diff --git
a/manifests/custom-metrics-api/custom-metrics-apiserver-deployment.yaml b/manifests/custom-metrics-api/custom-metrics-apiserver-deployment.yaml new file mode 100644 index 00000000..e5b4beea --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-apiserver-deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: custom-metrics-apiserver + name: custom-metrics-apiserver +spec: + replicas: 1 + selector: + matchLabels: + app: custom-metrics-apiserver + template: + metadata: + labels: + app: custom-metrics-apiserver + name: custom-metrics-apiserver + spec: + serviceAccountName: custom-metrics-apiserver + containers: + - name: custom-metrics-apiserver + image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.2.0 + args: + - /adapter + - --secure-port=6443 + - --tls-cert-file=/var/run/serving-cert/serving.crt + - --tls-private-key-file=/var/run/serving-cert/serving.key + - --logtostderr=true + - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ + - --metrics-relist-interval=30s + - --rate-interval=5m + - --v=10 + ports: + - containerPort: 6443 + volumeMounts: + - mountPath: /var/run/serving-cert + name: volume-serving-cert + readOnly: true + volumes: + - name: volume-serving-cert + secret: + secretName: cm-adapter-serving-certs diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml b/manifests/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml new file mode 100644 index 00000000..0335c177 --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: custom-metrics-resource-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: custom-metrics-resource-reader +subjects: +- kind: ServiceAccount + name: custom-metrics-apiserver + 
namespace: monitoring diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-service-account.yaml b/manifests/custom-metrics-api/custom-metrics-apiserver-service-account.yaml new file mode 100644 index 00000000..29359409 --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-apiserver-service-account.yaml @@ -0,0 +1,4 @@ +kind: ServiceAccount +apiVersion: v1 +metadata: + name: custom-metrics-apiserver diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-service.yaml b/manifests/custom-metrics-api/custom-metrics-apiserver-service.yaml new file mode 100644 index 00000000..fb0addcb --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-apiserver-service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: custom-metrics-apiserver +spec: + ports: + - port: 443 + targetPort: 6443 + selector: + app: custom-metrics-apiserver diff --git a/manifests/custom-metrics-api/custom-metrics-apiservice.yaml b/manifests/custom-metrics-api/custom-metrics-apiservice.yaml new file mode 100644 index 00000000..cfc2ee63 --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-apiservice.yaml @@ -0,0 +1,13 @@ +apiVersion: apiregistration.k8s.io/v1beta1 +kind: APIService +metadata: + name: v1beta1.custom.metrics.k8s.io +spec: + service: + name: custom-metrics-apiserver + namespace: monitoring + group: custom.metrics.k8s.io + version: v1beta1 + insecureSkipTLSVerify: true + groupPriorityMinimum: 100 + versionPriority: 100 diff --git a/manifests/custom-metrics-api/custom-metrics-cluster-role.yaml b/manifests/custom-metrics-api/custom-metrics-cluster-role.yaml new file mode 100644 index 00000000..003f0bf1 --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-cluster-role.yaml @@ -0,0 +1,9 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: custom-metrics-server-resources +rules: +- apiGroups: + - custom.metrics.k8s.io + resources: ["*"] + verbs: ["*"] diff --git 
a/manifests/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml b/manifests/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml new file mode 100644 index 00000000..a5ad7604 --- /dev/null +++ b/manifests/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: custom-metrics-resource-reader +rules: +- apiGroups: + - "" + resources: + - namespaces + - pods + - services + verbs: + - get + - list diff --git a/manifests/custom-metrics-api/deploy.sh b/manifests/custom-metrics-api/deploy.sh new file mode 100755 index 00000000..2255c7fd --- /dev/null +++ b/manifests/custom-metrics-api/deploy.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +kubectl create -f custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml +kubectl create -f custom-metrics-apiserver-auth-reader-role-binding.yaml +kubectl -n monitoring create -f cm-adapter-serving-certs.yaml +kubectl -n monitoring create -f custom-metrics-apiserver-deployment.yaml +kubectl create -f custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml +kubectl -n monitoring create -f custom-metrics-apiserver-service-account.yaml +kubectl -n monitoring create -f custom-metrics-apiserver-service.yaml +kubectl create -f custom-metrics-apiservice.yaml +kubectl create -f custom-metrics-cluster-role.yaml +kubectl create -f custom-metrics-resource-reader-cluster-role.yaml +kubectl create -f hpa-custom-metrics-cluster-role-binding.yaml diff --git a/manifests/custom-metrics-api/gencerts.sh b/manifests/custom-metrics-api/gencerts.sh new file mode 100755 index 00000000..312ce74c --- /dev/null +++ b/manifests/custom-metrics-api/gencerts.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +go get -v -u github.com/cloudflare/cfssl/cmd/... 
+ +export PURPOSE=metrics +openssl req -x509 -sha256 -new -nodes -days 365 -newkey rsa:2048 -keyout ${PURPOSE}-ca.key -out ${PURPOSE}-ca.crt -subj "/CN=ca" +echo '{"signing":{"default":{"expiry":"43800h","usages":["signing","key encipherment","'${PURPOSE}'"]}}}' > "${PURPOSE}-ca-config.json" + +export SERVICE_NAME=custom-metrics-apiserver +export ALT_NAMES='"custom-metrics-apiserver.monitoring","custom-metrics-apiserver.monitoring.svc"' +echo '{"CN":"'${SERVICE_NAME}'","hosts":['${ALT_NAMES}'],"key":{"algo":"rsa","size":2048}}' | cfssl gencert -ca=metrics-ca.crt -ca-key=metrics-ca.key -config=metrics-ca-config.json - | cfssljson -bare apiserver + +cat <<-EOF > cm-adapter-serving-certs.yaml +apiVersion: v1 +kind: Secret +metadata: + name: cm-adapter-serving-certs +data: + serving.crt: $(cat apiserver.pem | base64 --wrap=0) + serving.key: $(cat apiserver-key.pem | base64 --wrap=0) +EOF diff --git a/manifests/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml b/manifests/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml new file mode 100644 index 00000000..530ebea5 --- /dev/null +++ b/manifests/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: hpa-controller-custom-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: custom-metrics-server-resources +subjects: +- kind: ServiceAccount + name: horizontal-pod-autoscaler + namespace: kube-system diff --git a/manifests/custom-metrics-api/teardown.sh b/manifests/custom-metrics-api/teardown.sh new file mode 100755 index 00000000..4797de1c --- /dev/null +++ b/manifests/custom-metrics-api/teardown.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +kubectl delete -f custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml +kubectl delete -f custom-metrics-apiserver-auth-reader-role-binding.yaml +kubectl -n monitoring delete -f 
cm-adapter-serving-certs.yaml +kubectl -n monitoring delete -f custom-metrics-apiserver-deployment.yaml +kubectl delete -f custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml +kubectl -n monitoring delete -f custom-metrics-apiserver-service-account.yaml +kubectl -n monitoring delete -f custom-metrics-apiserver-service.yaml +kubectl delete -f custom-metrics-apiservice.yaml +kubectl delete -f custom-metrics-cluster-role.yaml +kubectl delete -f custom-metrics-resource-reader-cluster-role.yaml +kubectl delete -f hpa-custom-metrics-cluster-role-binding.yaml From 3a2dd05da48a27fb275b508a2d986e6f4947ba78 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 10 Nov 2017 12:00:53 +0100 Subject: [PATCH 149/638] kube-prometheus: add metrics-server example --- manifests/metrics-server/auth-delegator.yaml | 12 +++++++++ manifests/metrics-server/auth-reader.yaml | 13 ++++++++++ .../metrics-server/metrics-apiservice.yaml | 13 ++++++++++ .../metrics-server-cluster-role-binding.yaml | 12 +++++++++ .../metrics-server-cluster-role.yaml | 23 +++++++++++++++++ .../metrics-server-deployment.yaml | 25 +++++++++++++++++++ .../metrics-server-service-account.yaml | 5 ++++ .../metrics-server-service.yaml | 14 +++++++++++ 8 files changed, 117 insertions(+) create mode 100644 manifests/metrics-server/auth-delegator.yaml create mode 100644 manifests/metrics-server/auth-reader.yaml create mode 100644 manifests/metrics-server/metrics-apiservice.yaml create mode 100644 manifests/metrics-server/metrics-server-cluster-role-binding.yaml create mode 100644 manifests/metrics-server/metrics-server-cluster-role.yaml create mode 100644 manifests/metrics-server/metrics-server-deployment.yaml create mode 100644 manifests/metrics-server/metrics-server-service-account.yaml create mode 100644 manifests/metrics-server/metrics-server-service.yaml diff --git a/manifests/metrics-server/auth-delegator.yaml b/manifests/metrics-server/auth-delegator.yaml new file mode 100644 index 
00000000..04826aec --- /dev/null +++ b/manifests/metrics-server/auth-delegator.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: metrics-server:system:auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: metrics-server + namespace: kube-system diff --git a/manifests/metrics-server/auth-reader.yaml b/manifests/metrics-server/auth-reader.yaml new file mode 100644 index 00000000..1ab6a6a3 --- /dev/null +++ b/manifests/metrics-server/auth-reader.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: metrics-server-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: metrics-server + namespace: kube-system diff --git a/manifests/metrics-server/metrics-apiservice.yaml b/manifests/metrics-server/metrics-apiservice.yaml new file mode 100644 index 00000000..a8860fbc --- /dev/null +++ b/manifests/metrics-server/metrics-apiservice.yaml @@ -0,0 +1,13 @@ +apiVersion: apiregistration.k8s.io/v1beta1 +kind: APIService +metadata: + name: v1beta1.metrics.k8s.io +spec: + service: + name: metrics-server + namespace: kube-system + group: metrics.k8s.io + version: v1beta1 + insecureSkipTLSVerify: true + groupPriorityMinimum: 100 + versionPriority: 100 diff --git a/manifests/metrics-server/metrics-server-cluster-role-binding.yaml b/manifests/metrics-server/metrics-server-cluster-role-binding.yaml new file mode 100644 index 00000000..dc634345 --- /dev/null +++ b/manifests/metrics-server/metrics-server-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:metrics-server +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: 
system:metrics-server +subjects: +- kind: ServiceAccount + name: metrics-server + namespace: kube-system diff --git a/manifests/metrics-server/metrics-server-cluster-role.yaml b/manifests/metrics-server/metrics-server-cluster-role.yaml new file mode 100644 index 00000000..6976f5ce --- /dev/null +++ b/manifests/metrics-server/metrics-server-cluster-role.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:metrics-server +rules: +- apiGroups: + - "" + resources: + - pods + - nodes + - namespaces + verbs: + - get + - list + - watch +- apiGroups: + - "extensions" + resources: + - deployments + verbs: + - get + - list + - watch diff --git a/manifests/metrics-server/metrics-server-deployment.yaml b/manifests/metrics-server/metrics-server-deployment.yaml new file mode 100644 index 00000000..386740da --- /dev/null +++ b/manifests/metrics-server/metrics-server-deployment.yaml @@ -0,0 +1,25 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: metrics-server + namespace: kube-system + labels: + k8s-app: metrics-server +spec: + selector: + matchLabels: + k8s-app: metrics-server + template: + metadata: + name: metrics-server + labels: + k8s-app: metrics-server + spec: + serviceAccountName: metrics-server + containers: + - name: metrics-server + image: gcr.io/google_containers/metrics-server-amd64:v0.2.0 + imagePullPolicy: Always + command: + - /metrics-server + - --source=kubernetes.summary_api:'' diff --git a/manifests/metrics-server/metrics-server-service-account.yaml b/manifests/metrics-server/metrics-server-service-account.yaml new file mode 100644 index 00000000..ee205aa6 --- /dev/null +++ b/manifests/metrics-server/metrics-server-service-account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: metrics-server + namespace: kube-system diff --git a/manifests/metrics-server/metrics-server-service.yaml b/manifests/metrics-server/metrics-server-service.yaml new file mode 
100644 index 00000000..974628e0 --- /dev/null +++ b/manifests/metrics-server/metrics-server-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: metrics-server + namespace: kube-system + labels: + kubernetes.io/name: "Metrics-server" +spec: + selector: + k8s-app: metrics-server + ports: + - port: 443 + protocol: TCP + targetPort: 443 From 598d6779cd999ffba73c35ba17494669ddfbc140 Mon Sep 17 00:00:00 2001 From: Aleksandar Topuzovic Date: Tue, 14 Nov 2017 14:36:22 +0000 Subject: [PATCH 150/638] Alert on daemonset problems * If any of the rules is active > 10m * If all daemonsets are not ready * If all daemonsets are not scheduled * If some are misscheduled --- assets/prometheus/rules/kubelet.rules.yaml | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml index 1aa5f84c..03ea03da 100644 --- a/assets/prometheus/rules/kubelet.rules.yaml +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -47,3 +47,30 @@ groups: description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 summary: Kubelet is close to pod limit + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not scheduled. + summary: Daemonsets are not scheduled correctly + - alert: K8SDaemonSetsNotRunning + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not ready.
+ summary: Daemonsets are not ready + - alert: K8SDaemonSetsMissScheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are running where they are not supposed + to run. + summary: Daemonsets are not scheduled correctly From 8cbdac6a2081b8bb74f49b6f50b4eb37fe23505f Mon Sep 17 00:00:00 2001 From: Aleksandar Topuzovic Date: Tue, 14 Nov 2017 14:38:26 +0000 Subject: [PATCH 151/638] Add generated manifests * Generated by `make generate` in `contrib/kube-prometheus` --- .../prometheus/prometheus-k8s-rules.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 041c127b..3a2a53a0 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -314,6 +314,33 @@ data: description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 summary: Kubelet is close to pod limit + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not scheduled. + summary: Daemonsets are not scheduled correctly + - alert: K8SDaemonSetsNotRunning + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not ready. + summary: Daemonsets are not ready + - alert: K8SDaemonSetsMissScheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are running where they are not supposed + to run. 
+ summary: Daemonsets are not scheduled correctly kubernetes.rules.yaml: |+ groups: - name: ./kubernetes.rules From 6bd725b3476dfb0f7bbbeb8f2f6f00b93a990439 Mon Sep 17 00:00:00 2001 From: "Cesarini, Daniele" Date: Tue, 14 Nov 2017 15:11:24 +0000 Subject: [PATCH 152/638] Fixing #751 K8SApiServerLatency always triggering --- manifests/prometheus/prometheus-k8s-rules.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 041c127b..d470c7bf 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -227,8 +227,8 @@ data: disappeared from service discovery. summary: API server unreachable - alert: K8SApiServerLatency - expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) - WITHOUT (instance, resource)) / 1e+06 > 1 + expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) + by (le)) / 1e+06 > 1 for: 10m labels: severity: warning From 727d053dd4beb5fc97e4d255ffbd58bdd680f35d Mon Sep 17 00:00:00 2001 From: "Cesarini, Daniele" Date: Tue, 14 Nov 2017 15:48:14 +0000 Subject: [PATCH 153/638] Fixing #751 K8SApiServerLatency always triggering --- assets/prometheus/rules/kube-apiserver.rules.yaml | 4 ++-- manifests/grafana/grafana-dashboards.yaml | 1 + manifests/prometheus/prometheus-k8s-rules.yaml | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/assets/prometheus/rules/kube-apiserver.rules.yaml b/assets/prometheus/rules/kube-apiserver.rules.yaml index 55ebe025..50982b05 100644 --- a/assets/prometheus/rules/kube-apiserver.rules.yaml +++ b/assets/prometheus/rules/kube-apiserver.rules.yaml @@ -11,8 +11,8 @@ groups: disappeared from service discovery. 
summary: API server unreachable - alert: K8SApiServerLatency - expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) - WITHOUT (instance, resource)) / 1e+06 > 1 + expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) + by (le)) / 1e+06 > 1 for: 10m labels: severity: warning diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index aff751da..a55a8df5 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -6287,3 +6287,4 @@ data: "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090" } +--- diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index d470c7bf..6be3ee49 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -227,8 +227,8 @@ data: disappeared from service discovery. summary: API server unreachable - alert: K8SApiServerLatency - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) - by (le)) / 1e+06 > 1 + expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) + by (le)) / 1e+06 > 1 for: 10m labels: severity: warning From 0bfc3d0966d0ca9d4cddf1335a01c0dcd7ea7e0e Mon Sep 17 00:00:00 2001 From: stroebitzer Date: Fri, 17 Nov 2017 13:15:21 +0100 Subject: [PATCH 154/638] Fix docu concerning initial deployment After cloning the repo I wanted to deploy kube-prometheus into my running minikube. 
I did the steps mentioned in the documentation, which brought me to the following exception: namespace "monitoring" created error: the path "manifests/prometheus-operator" does not exist Waiting for Operator to register custom resource definitions.................................................................................................................^C --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c8236bf5..321179b8 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Simply run: ```bash export KUBECONFIG= # defaults to "~/.kube/config" -hack/cluster-monitoring/deploy +prometheus-operator/contrib/kube-prometheus/hack/cluster-monitoring/deploy ``` After all pods are ready, you can reach: From 9e77cc4bbdf3ab73e8617d7dede86751cfe8f3f5 Mon Sep 17 00:00:00 2001 From: stroebitzer Date: Tue, 21 Nov 2017 08:42:06 +0100 Subject: [PATCH 155/638] add hint to change to proper dir in readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 321179b8..804dc942 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,8 @@ Simply run: ```bash export KUBECONFIG= # defaults to "~/.kube/config" -prometheus-operator/contrib/kube-prometheus/hack/cluster-monitoring/deploy +cd contrib/kube-prometheus/ +hack/cluster-monitoring/deploy ``` After all pods are ready, you can reach: From a37ad3a270ec153f3f0de0800464dcb32f326426 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 21 Nov 2017 16:37:50 +0100 Subject: [PATCH 156/638] kube-prometheus: sync rules --- .../prometheus/rules/alertmanager.rules.yaml | 7 +- assets/prometheus/rules/general.rules.yaml | 27 +-- .../rules/kube-apiserver.rules.yaml | 22 --- .../rules/kube-controller-manager.rules.yaml | 2 +- .../rules/kube-scheduler.rules.yaml | 47 ++++- .../rules/kube-state-metrics.rules.yaml | 55 ++++++ assets/prometheus/rules/kubelet.rules.yaml | 41 +--- assets/prometheus/rules/kubernetes.rules.yaml | 175
++++++++---------- assets/prometheus/rules/node.rules.yaml | 57 +++--- assets/prometheus/rules/prometheus.rules.yaml | 42 ++++- 10 files changed, 261 insertions(+), 214 deletions(-) delete mode 100644 assets/prometheus/rules/kube-apiserver.rules.yaml create mode 100644 assets/prometheus/rules/kube-state-metrics.rules.yaml diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml index 8f65c5da..fdfdfd0f 100644 --- a/assets/prometheus/rules/alertmanager.rules.yaml +++ b/assets/prometheus/rules/alertmanager.rules.yaml @@ -1,5 +1,5 @@ groups: -- name: ./alertmanager.rules +- name: alertmanager.rules rules: - alert: AlertmanagerConfigInconsistent expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) @@ -11,7 +11,6 @@ groups: annotations: description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. - summary: Alertmanager configurations are inconsistent - alert: AlertmanagerDownOrMissing expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 @@ -21,8 +20,7 @@ groups: annotations: description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. - summary: Alertmanager down or not discovered - - alert: FailedReload + - alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful == 0 for: 10m labels: @@ -30,4 +28,3 @@ groups: annotations: description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. 
- summary: Alertmanager configuration reload has failed diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml index 355e12f3..84ce6b47 100644 --- a/assets/prometheus/rules/general.rules.yaml +++ b/assets/prometheus/rules/general.rules.yaml @@ -1,5 +1,5 @@ groups: -- name: ./general.rules +- name: general.rules rules: - alert: TargetDown expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 @@ -7,7 +7,7 @@ groups: labels: severity: warning annotations: - description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + description: '{{ $value }}% of {{ $labels.job }} targets are down.' summary: Targets are down - alert: DeadMansSwitch expr: vector(1) @@ -17,32 +17,23 @@ groups: description: This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional. summary: Alerting DeadMansSwitch - - alert: TooManyOpenFileDescriptors - expr: 100 * (process_open_fds / process_max_fds) > 95 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' 
- summary: too many open file descriptors - - record: instance:fd_utilization + - record: fd_utilization expr: process_open_fds / process_max_fds - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 for: 10m labels: severity: warning annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) instance will exhaust in file/socket descriptors soon' + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next 4 hours' summary: file descriptors soon exhausted - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + expr: predict_linear(fd_utilization[10m], 3600) > 1 for: 10m labels: severity: critical annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) instance will exhaust in file/socket descriptors soon' + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next hour' summary: file descriptors soon exhausted diff --git a/assets/prometheus/rules/kube-apiserver.rules.yaml b/assets/prometheus/rules/kube-apiserver.rules.yaml deleted file mode 100644 index 50982b05..00000000 --- a/assets/prometheus/rules/kube-apiserver.rules.yaml +++ /dev/null @@ -1,22 +0,0 @@ -groups: -- name: ./kube-apiserver.rules - rules: - - alert: K8SApiserverDown - expr: absent(up{job="apiserver"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: Prometheus failed to scrape API server(s), or all API servers have - disappeared from service discovery. 
- summary: API server unreachable - - alert: K8SApiServerLatency - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) - by (le)) / 1e+06 > 1 - for: 10m - labels: - severity: warning - annotations: - description: 99th percentile Latency for {{ $labels.verb }} requests to the - kube-apiserver is higher than 1s. - summary: Kubernetes apiserver latency is high diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml index f23bbde3..4ea82ed1 100644 --- a/assets/prometheus/rules/kube-controller-manager.rules.yaml +++ b/assets/prometheus/rules/kube-controller-manager.rules.yaml @@ -1,5 +1,5 @@ groups: -- name: ./kube-controller-manager.rules +- name: kube-controller-manager.rules rules: - alert: K8SControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) diff --git a/assets/prometheus/rules/kube-scheduler.rules.yaml b/assets/prometheus/rules/kube-scheduler.rules.yaml index 0383b3b1..8f0c01fd 100644 --- a/assets/prometheus/rules/kube-scheduler.rules.yaml +++ b/assets/prometheus/rules/kube-scheduler.rules.yaml @@ -1,6 +1,51 @@ groups: -- name: ./kube-scheduler.rules +- name: kube-scheduler.rules rules: + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: 
cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" - alert: K8SSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 5m diff --git a/assets/prometheus/rules/kube-state-metrics.rules.yaml b/assets/prometheus/rules/kube-state-metrics.rules.yaml new file mode 100644 index 00000000..32b99fa2 --- /dev/null +++ b/assets/prometheus/rules/kube-state-metrics.rules.yaml @@ -0,0 +1,55 @@ +groups: +- name: kube-state-metrics.rules + rules: + - alert: DeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation + for: 15m + labels: + severity: warning + annotations: + description: Observed deployment generation does not match expected one for + deployment {{$labels.namespaces}}{{$labels.deployment}} + - 
alert: DeploymentReplicasNotUpdated + expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) + or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) + unless (kube_deployment_spec_paused == 1) + for: 15m + labels: + severity: warning + annotations: + description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + - alert: DaemonSetRolloutStuck + expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled + * 100 < 100 + for: 15m + labels: + severity: warning + annotations: + description: Only {{$value}}% of desired pods scheduled and ready for daemon + set {{$labels.namespaces}}/{{$labels.daemonset}} + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not scheduled. + summary: Daemonsets are not scheduled correctly + - alert: DaemonSetsMissScheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are running where they are not supposed + to run. 
+ summary: Daemonsets are not scheduled correctly + - alert: PodFrequentlyRestarting + expr: increase(kube_pod_container_status_restarts[1h]) > 5 + for: 10m + labels: + severity: warning + annotations: + description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}} + times within the last hour diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml index 03ea03da..a1fc93cb 100644 --- a/assets/prometheus/rules/kubelet.rules.yaml +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -1,5 +1,5 @@ groups: -- name: ./kubelet.rules +- name: kubelet.rules rules: - alert: K8SNodeNotReady expr: kube_node_status_condition{condition="Ready",status="true"} == 0 @@ -18,20 +18,17 @@ groups: labels: severity: critical annotations: - description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady - state).' - summary: Many Kubernetes nodes are Not Ready + description: '{{ $value }}% of Kubernetes nodes are not ready' - alert: K8SKubeletDown - expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 for: 1h labels: severity: warning annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets. 
- summary: Many Kubelets cannot be scraped - alert: K8SKubeletDown - expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) - > 0.1 + expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) + * 100 > 1 for: 1h labels: severity: critical @@ -41,36 +38,10 @@ groups: summary: Many Kubelets cannot be scraped - alert: K8SKubeletTooManyPods expr: kubelet_running_pod_count > 100 + for: 10m labels: severity: warning annotations: description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 summary: Kubelet is close to pod limit - - alert: K8SDaemonSetsNotScheduled - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not scheduled. - summary: Daemonsets are not scheduled correctly - - alert: K8SDaemonSetsNotRunning - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not ready. - summary: Daemonsets are not ready - - alert: K8SDaemonSetsMissScheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are running where they are not supposed - to run. 
- summary: Daemonsets are not scheduled correctly diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml index ab5ccf06..f13d0088 100644 --- a/assets/prometheus/rules/kubernetes.rules.yaml +++ b/assets/prometheus/rules/kubernetes.rules.yaml @@ -1,115 +1,86 @@ groups: -- name: ./kubernetes.rules +- name: kubernetes.rules rules: - - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes - expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:spec_cpu_shares - expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:cpu_usage:rate - expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_usage:bytes - expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_working_set:bytes - expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_rss:bytes - expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - 
container_name) - - record: cluster_namespace_controller_pod_container:memory_cache:bytes - expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:disk_usage:bytes - expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate - expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name, scope, type) - - record: cluster_namespace_controller_pod_container:memory_oom:rate - expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name, scope, type) - - record: cluster:memory_allocation:percent - expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) - / sum(machine_memory_bytes) BY (cluster) - - record: cluster:memory_used:percent - expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) - BY (cluster) - - record: cluster:cpu_allocation:percent - expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} - * ON(cluster, instance) machine_cpu_cores) BY (cluster) - - record: cluster:node_cpu_use:percent - expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) - BY (cluster) - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 
1e+06 + - record: pod_name:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (pod_name) + - record: pod_name:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: pod_name:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) + BY (pod_name) + - record: pod_name:container_fs_usage_bytes:sum + expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: namespace:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) + - record: namespace:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) + - record: namespace:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) + BY (namespace) + - record: cluster:memory_usage:ratio + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:container_spec_cpu_shares:ratio + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 + / sum(machine_cpu_cores) + - record: cluster:container_cpu_usage:ratio + expr: rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]) + / sum(machine_cpu_cores) + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.99" - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.9" - - 
record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.5" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 1 + for: 10m labels: - quantile: "0.99" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: warning + annotations: + description: the API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 4 + for: 10m labels: - quantile: "0.9" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: critical + annotations: + description: the API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 2 + for: 10m labels: - quantile: "0.5" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, 
cluster)) / 1e+06 + severity: warning + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 5 + for: 10m labels: - quantile: "0.99" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: critical + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 20m labels: - quantile: "0.9" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.99" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" + severity: critical + annotations: + description: No API servers are reachable or all have disappeared from service + discovery diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml index 9c1641ca..0e7e1bbd 100644 --- a/assets/prometheus/rules/node.rules.yaml +++ b/assets/prometheus/rules/node.rules.yaml @@ -1,6 +1,23 @@ groups: -- name: ./node.rules +- name: node.rules rules: + - record: 
instance:node_cpu:rate:sum + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + BY (instance) + - record: instance:node_filesystem_usage:sum + expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + BY (instance) + - record: instance:node_network_receive_bytes:rate:sum + expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + - record: instance:node_network_transmit_bytes:rate:sum + expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + - record: instance:node_cpu:ratio + expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + - record: cluster:node_cpu:sum_rate5m + expr: sum(rate(node_cpu{mode!="idle"}[5m])) + - record: cluster:node_cpu:ratio + expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 10m @@ -8,30 +25,20 @@ groups: severity: warning annotations: description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery. - summary: node-exporter cannot be scraped - - alert: K8SNodeOutOfDisk - expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + or node-exporters have disappeared from discovery + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 + for: 30m + labels: + severity: warning + annotations: + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 24 hours (mounted at {{$labels.mountpoint}}) + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 + for: 10m labels: - service: k8s severity: critical annotations: - description: '{{ $labels.node }} has run out of disk space.' - summary: Node ran out of disk space. 
- - alert: K8SNodeMemoryPressure - expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == - 1 - labels: - service: k8s - severity: warning - annotations: - description: '{{ $labels.node }} is under memory pressure.' - summary: Node is under memory pressure. - - alert: K8SNodeDiskPressure - expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 - labels: - service: k8s - severity: warning - annotations: - description: '{{ $labels.node }} is under disk pressure.' - summary: Node is under disk pressure. + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 2 hours (mounted at {{$labels.mountpoint}}) diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml index 6ed0cd68..df51d010 100644 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -1,12 +1,44 @@ groups: -- name: ./prometheus.rules +- name: prometheus.rules rules: - - alert: FailedReload + - alert: PrometheusConfigReloadFailed expr: prometheus_config_last_reload_successful == 0 for: 10m labels: severity: warning annotations: - description: Reloading Prometheus' configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. 
- summary: Prometheus configuration reload has failed + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull + expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity + for: 10m + labels: + severity: warning + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.03 + for: 10m + labels: + severity: critical + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusNotConnectedToAlertmanagers + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 10m + labels: + severity: warning + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers From b421172bba843a2ad2961951eaa5bf37080a95d9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 21 Nov 2017 16:47:16 +0100 Subject: [PATCH 157/638] kube-prometheus: re-generate manifests --- .../prometheus/prometheus-k8s-rules.yaml | 477 ++++++++++-------- 1 file changed, 262 insertions(+), 215 deletions(-) diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 3e6552c1..6493ff74 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ 
b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -8,7 +8,7 @@ metadata: data: alertmanager.rules.yaml: |+ groups: - - name: ./alertmanager.rules + - name: alertmanager.rules rules: - alert: AlertmanagerConfigInconsistent expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) @@ -20,7 +20,6 @@ data: annotations: description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. - summary: Alertmanager configurations are inconsistent - alert: AlertmanagerDownOrMissing expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 @@ -30,8 +29,7 @@ data: annotations: description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. - summary: Alertmanager down or not discovered - - alert: FailedReload + - alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful == 0 for: 10m labels: @@ -39,7 +37,6 @@ data: annotations: description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. - summary: Alertmanager configuration reload has failed etcd3.rules.yaml: |+ groups: - name: ./etcd3.rules @@ -166,7 +163,7 @@ data: summary: high commit durations general.rules.yaml: |+ groups: - - name: ./general.rules + - name: general.rules rules: - alert: TargetDown expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 @@ -174,7 +171,7 @@ data: labels: severity: warning annotations: - description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + description: '{{ $value }}% of {{ $labels.job }} targets are down.' summary: Targets are down - alert: DeadMansSwitch expr: vector(1) @@ -184,61 +181,29 @@ data: description: This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional. 
summary: Alerting DeadMansSwitch - - alert: TooManyOpenFileDescriptors - expr: 100 * (process_open_fds / process_max_fds) > 95 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' - summary: too many open file descriptors - - record: instance:fd_utilization + - record: fd_utilization expr: process_open_fds / process_max_fds - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 for: 10m labels: severity: warning annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) instance will exhaust in file/socket descriptors soon' + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next 4 hours' summary: file descriptors soon exhausted - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + expr: predict_linear(fd_utilization[10m], 3600) > 1 for: 10m labels: severity: critical annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) instance will exhaust in file/socket descriptors soon' + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next hour' summary: file descriptors soon exhausted - kube-apiserver.rules.yaml: |+ - groups: - - name: ./kube-apiserver.rules - rules: - - alert: K8SApiserverDown - expr: absent(up{job="apiserver"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: Prometheus failed to scrape API server(s), or all API servers have - disappeared from service discovery. 
- summary: API server unreachable - - alert: K8SApiServerLatency - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) - by (le)) / 1e+06 > 1 - for: 10m - labels: - severity: warning - annotations: - description: 99th percentile Latency for {{ $labels.verb }} requests to the - kube-apiserver is higher than 1s. - summary: Kubernetes apiserver latency is high kube-controller-manager.rules.yaml: |+ groups: - - name: ./kube-controller-manager.rules + - name: kube-controller-manager.rules rules: - alert: K8SControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) @@ -252,8 +217,53 @@ data: summary: Controller manager is down kube-scheduler.rules.yaml: |+ groups: - - name: ./kube-scheduler.rules + - name: kube-scheduler.rules rules: + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: 
cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" - alert: K8SSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 5m @@ -264,9 +274,65 @@ data: to nodes. runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler summary: Scheduler is down + kube-state-metrics.rules.yaml: |+ + groups: + - name: kube-state-metrics.rules + rules: + - alert: DeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation + for: 15m + labels: + severity: warning + annotations: + description: Observed deployment generation does not match expected one for + deployment {{$labels.namespaces}}{{$labels.deployment}} + - alert: DeploymentReplicasNotUpdated + expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) + or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) + unless (kube_deployment_spec_paused == 1) + for: 15m + labels: + severity: warning + annotations: + description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + - alert: DaemonSetRolloutStuck + expr: kube_daemonset_status_current_number_ready / 
kube_daemonset_status_desired_number_scheduled + * 100 < 100 + for: 15m + labels: + severity: warning + annotations: + description: Only {{$value}}% of desired pods scheduled and ready for daemon + set {{$labels.namespaces}}/{{$labels.daemonset}} + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not scheduled. + summary: Daemonsets are not scheduled correctly + - alert: DaemonSetsMissScheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are running where they are not supposed + to run. + summary: Daemonsets are not scheduled correctly + - alert: PodFrequentlyRestarting + expr: increase(kube_pod_container_status_restarts[1h]) > 5 + for: 10m + labels: + severity: warning + annotations: + description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}} + times within the last hour kubelet.rules.yaml: |+ groups: - - name: ./kubelet.rules + - name: kubelet.rules rules: - alert: K8SNodeNotReady expr: kube_node_status_condition{condition="Ready",status="true"} == 0 @@ -285,20 +351,17 @@ data: labels: severity: critical annotations: - description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady - state).' - summary: Many Kubernetes nodes are Not Ready + description: '{{ $value }}% of Kubernetes nodes are not ready' - alert: K8SKubeletDown - expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 for: 1h labels: severity: warning annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets. 
- summary: Many Kubelets cannot be scraped - alert: K8SKubeletDown - expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) - > 0.1 + expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) + * 100 > 1 for: 1h labels: severity: critical @@ -308,159 +371,121 @@ data: summary: Many Kubelets cannot be scraped - alert: K8SKubeletTooManyPods expr: kubelet_running_pod_count > 100 + for: 10m labels: severity: warning annotations: description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 summary: Kubelet is close to pod limit - - alert: K8SDaemonSetsNotScheduled - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not scheduled. - summary: Daemonsets are not scheduled correctly - - alert: K8SDaemonSetsNotRunning - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not ready. - summary: Daemonsets are not ready - - alert: K8SDaemonSetsMissScheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are running where they are not supposed - to run. 
- summary: Daemonsets are not scheduled correctly kubernetes.rules.yaml: |+ groups: - - name: ./kubernetes.rules + - name: kubernetes.rules rules: - - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes - expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:spec_cpu_shares - expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:cpu_usage:rate - expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_usage:bytes - expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_working_set:bytes - expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_rss:bytes - expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_cache:bytes - expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, 
controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:disk_usage:bytes - expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate - expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name, scope, type) - - record: cluster_namespace_controller_pod_container:memory_oom:rate - expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name, scope, type) - - record: cluster:memory_allocation:percent - expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) - / sum(machine_memory_bytes) BY (cluster) - - record: cluster:memory_used:percent - expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) - BY (cluster) - - record: cluster:cpu_allocation:percent - expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} - * ON(cluster, instance) machine_cpu_cores) BY (cluster) - - record: cluster:node_cpu_use:percent - expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) - BY (cluster) - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: pod_name:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (pod_name) + - record: pod_name:container_spec_cpu_shares:sum + expr: 
sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: pod_name:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) + BY (pod_name) + - record: pod_name:container_fs_usage_bytes:sum + expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: namespace:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) + - record: namespace:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) + - record: namespace:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) + BY (namespace) + - record: cluster:memory_usage:ratio + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:container_spec_cpu_shares:ratio + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 + / sum(machine_cpu_cores) + - record: cluster:container_cpu_usage:ratio + expr: rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]) + / sum(machine_cpu_cores) + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.99" - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.9" - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: 
apiserver_latency_seconds:quantile + expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.5" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 1 + for: 10m labels: - quantile: "0.99" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: warning + annotations: + description: the API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 4 + for: 10m labels: - quantile: "0.9" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: critical + annotations: + description: the API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 2 + for: 10m labels: - quantile: "0.5" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: warning + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: APIServerErrorsHigh + expr: 
rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 5 + for: 10m labels: - quantile: "0.99" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: critical + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 20m labels: - quantile: "0.9" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.99" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" + severity: critical + annotations: + description: No API servers are reachable or all have disappeared from service + discovery node.rules.yaml: |+ groups: - - name: ./node.rules + - name: node.rules rules: + - record: instance:node_cpu:rate:sum + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + BY (instance) + - record: instance:node_filesystem_usage:sum + expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + BY (instance) + - record: instance:node_network_receive_bytes:rate:sum + expr: sum(rate(node_network_receive_bytes[3m])) BY 
(instance) + - record: instance:node_network_transmit_bytes:rate:sum + expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + - record: instance:node_cpu:ratio + expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + - record: cluster:node_cpu:sum_rate5m + expr: sum(rate(node_cpu{mode!="idle"}[5m])) + - record: cluster:node_cpu:ratio + expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 10m @@ -468,43 +493,65 @@ data: severity: warning annotations: description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery. - summary: node-exporter cannot be scraped - - alert: K8SNodeOutOfDisk - expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + or node-exporters have disappeared from discovery + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 + for: 30m + labels: + severity: warning + annotations: + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 24 hours (mounted at {{$labels.mountpoint}}) + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 + for: 10m labels: - service: k8s severity: critical annotations: - description: '{{ $labels.node }} has run out of disk space.' - summary: Node ran out of disk space. - - alert: K8SNodeMemoryPressure - expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == - 1 - labels: - service: k8s - severity: warning - annotations: - description: '{{ $labels.node }} is under memory pressure.' - summary: Node is under memory pressure. 
- - alert: K8SNodeDiskPressure - expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 - labels: - service: k8s - severity: warning - annotations: - description: '{{ $labels.node }} is under disk pressure.' - summary: Node is under disk pressure. + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 2 hours (mounted at {{$labels.mountpoint}}) prometheus.rules.yaml: |+ groups: - - name: ./prometheus.rules + - name: prometheus.rules rules: - - alert: FailedReload + - alert: PrometheusConfigReloadFailed expr: prometheus_config_last_reload_successful == 0 for: 10m labels: severity: warning annotations: - description: Reloading Prometheus' configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. - summary: Prometheus configuration reload has failed + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull + expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity + for: 10m + labels: + severity: warning + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.03 + for: 10m + labels: + severity: critical + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: 
PrometheusNotConnectedToAlertmanagers + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 10m + labels: + severity: warning + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers From 7c12e3379c6a086fbac0eff34012ac72794747a4 Mon Sep 17 00:00:00 2001 From: Antoine Legrand <2t.antoine@gmail.com> Date: Fri, 17 Nov 2017 14:31:05 +0100 Subject: [PATCH 158/638] *: cut v0.15.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 2ba8dab8..cafc2064 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.14.1 + image: quay.io/coreos/prometheus-operator:v0.15.0 name: prometheus-operator ports: - containerPort: 8080 From 40d746e12a96bd7eba2b3700318f58d6c95273c0 Mon Sep 17 00:00:00 2001 From: Nuala Gaffey Date: Wed, 22 Nov 2017 16:41:05 -0500 Subject: [PATCH 159/638] Promtool appends .yml, not .yaml, but for backwards compatibility, lets allow both --- hack/scripts/generate-rules-configmap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh index 9eb2efc8..7a98962d 100755 --- a/hack/scripts/generate-rules-configmap.sh +++ b/hack/scripts/generate-rules-configmap.sh @@ -11,7 +11,7 @@ metadata: data: EOF -for f in assets/prometheus/rules/*.rules.yaml +for f in assets/prometheus/rules/*.rules.y*ml do echo " $(basename $f): |+" cat $f | sed "s/^/ /g" From 563e74f85a9222a69ac79baed2249fe97b1a1d66 Mon Sep 17 00:00:00 2001 From: loganasherjones Date: Thu, 23 Nov 
2017 05:26:20 -0500 Subject: [PATCH 160/638] Documentation: Added kubeadm documentation #618 (#770) * Documentation: Added kubeadm documentation #618 This getting started guide covers a simple deployment of kube-prometheus and assumes you have deployed using kubeadm. Hopefully this helps clears up #618 * Moved documentation to appropriate folder, updated links. * Minor documentation editing Fixed referneces to Alertmanager. Slightly stronger wording on changing grafana user/pass --- docs/kube-prometheus-on-kubeadm.md | 148 +++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 docs/kube-prometheus-on-kubeadm.md diff --git a/docs/kube-prometheus-on-kubeadm.md b/docs/kube-prometheus-on-kubeadm.md new file mode 100644 index 00000000..fac211db --- /dev/null +++ b/docs/kube-prometheus-on-kubeadm.md @@ -0,0 +1,148 @@ +
+ + +# Kube Prometheus on Kubeadm + +The [kubeadm](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/) tool is linked by Kubernetes as the official way to deploy and manage self-hosted clusters. Kubeadm does a lot of heavy lifting by automatically configuring your Kubernetes cluster with some common options. This guide is intended to show you how to deploy Prometheus, Prometheus Operator and Kube Prometheus to get you started monitoring your cluster that was deployed with Kubeadm. + +This guide assumes you have a basic understanding of how to use the functionality the Prometheus Operator implements. If you haven't yet, we recommend reading through the [getting started guide](getting-started.md) as well as the [alerting guide](../../../Documentation/user-guides/alerting.md). + +## Kubeadm Pre-requisites + +This guide assumes you have some familiarity with `kubeadm` or at least have deployed a cluster using `kubeadm`. By default, `kubeadm` does not expose two of the services that we will be monitoring. Therefore, in order to get the most out of the `kube-prometheus` package, we need to make some quick tweaks to the Kubernetes cluster. Since we will be monitoring the `kube-controller-manager` and `kube-scheduler`, we must expose them to the cluster. + +By default, `kubeadm` runs these pods on your master and bound to `127.0.0.1`. There are a couple of ways to change this. The recommended way to change these features is to use the [kubeadm config file](https://kubernetes.io/docs/reference/generated/kubeadm/#config-file). 
An example configuration file can be used: + +```yaml +apiVersion: kubeadm.k8s.io/v1alpha1 +kind: MasterConfiguration +api: + advertiseAddress: 192.168.1.173 + bindPort: 6443 +authorizationModes: +- Node +- RBAC +certificatesDir: /etc/kubernetes/pki +cloudProvider: +etcd: + dataDir: /var/lib/etcd + endpoints: null +imageRepository: gcr.io/google_containers +kubernetesVersion: v1.8.3 +networking: + dnsDomain: cluster.local + serviceSubnet: 10.96.0.0/12 +nodeName: your-dev +tokenTTL: 24h0m0s +controllerManagerExtraArgs: + address: 0.0.0.0 +schedulerExtraArgs: + address: 0.0.0.0 +``` + +Notice the `schedulerExtraArgs` and `controllerManagerExtraArgs`. These settings expose the `kube-controller-manager` and `kube-scheduler` services to the rest of the cluster. + +In addition, we will be using `node-exporter` to monitor the `cAdvisor` service on all the nodes. This, however, requires a change to the `kubelet` service on the master as well as all the nodes. According to the Kubernetes documentation: + +> The kubeadm deb package ships with configuration for how the kubelet should be run. Note that the `kubeadm` CLI command will never touch this drop-in file. This drop-in file belongs to the kubeadm deb/rpm package. + +Again, we need to expose the `cadvisor` that is installed and managed by the `kubelet` daemon. To do so, we do the following on all the masters and nodes: + +``` +sed -e "/cadvisor-port=0/d" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf +systemctl daemon-reload +systemctl restart kubelet +``` + +With these changes, your Kubernetes cluster is ready. + +## Metric Sources + +Monitoring a Kubernetes cluster with Prometheus is a natural choice as Kubernetes components themselves are instrumented with Prometheus metrics, therefore those components simply have to be discovered by Prometheus and most of the cluster is monitored. 
+ +Metrics that describe cluster state rather than a single component are exposed by the add-on component [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics). + +Additionally, to have an overview of cluster nodes' resources the Prometheus [node_exporter](https://github.com/prometheus/node_exporter) is used. The node_exporter allows monitoring a node's resources: CPU, memory and disk utilization and more. + +Once you complete this guide you will monitor the following: + +* cluster state via kube-state-metrics +* nodes via the node_exporter +* kubelets +* apiserver +* kube-scheduler +* kube-controller-manager + + +## Getting Up and Running Fast with Kube-Prometheus + +To help get started more quickly with monitoring Kubernetes clusters, [kube-prometheus](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) was created. It is a collection of manifests including dashboards and alerting rules that can easily be deployed. It utilizes the Prometheus Operator and all the manifests demonstrated in [this guide](../../../Documentation/user-guides/cluster-monitoring.md). + +This section presents a quick installation and is not intended to teach you about all the components. The easiest way to get started is to clone this repository and use the `kube-prometheus` section of the code. + +``` +git clone https://github.com/coreos/prometheus-operator +cd prometheus-operator/contrib/kube-prometheus/ +``` + +First, create the namespace in which you want the monitoring tool suite to be running. + +``` +export NAMESPACE='monitoring' +kubectl create namespace "$NAMESPACE" +``` + +Now we will create the components for the Prometheus Operator: + +``` +kubectl --namespace="$NAMESPACE" apply -f manifests/prometheus-operator +``` + +This will create all the Prometheus Operator components. You might need to wait a short amount of time before the Custom Resource Definitions are available in the cluster. 
You can wait for them: + +``` +until kubectl --namespace="$NAMESPACE" get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +``` + +Next, we will install the node exporter and then kube-state-metrics: + +``` +kubectl --namespace="$NAMESPACE" apply -f manifests/node-exporter +kubectl --namespace="$NAMESPACE" apply -f manifests/kube-state-metrics +``` + +Then, we can deploy the grafana credentials. By default, the username/password will be `admin/admin`, you should change these for your production clusters. + +``` +kubectl --namespace="$NAMESPACE" apply -f manifests/grafana/grafana-credentials.yaml +``` + +Then install grafana itself: + +``` +kubectl --namespace="$NAMESPACE" apply -f manifests/grafana +``` + +Next up is the `Prometheus` object itself. We will deploy the application, and then the roles/role-bindings. + +``` +find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \; +kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml +kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml +``` + +Finally, install the [Alertmanager](../../../Documentation/user-guides/alerting.md) + +``` +kubectl --namespace="$NAMESPACE" apply -f manifests/alertmanager +``` + +Now you should have a working cluster. After all the pods are ready, you should be able to reach: + +* Prometheus UI on node port `30900` +* Alertmanager UI on node port `30903` +* Grafana on node port `30902` + +These can of course be changed via the Service definitions. It is recommended to look at the [Exposing Prometheus and Alert Manager](../../../Documentation/user-guides/exposing-prometheus-and-alertmanager.md) documentation for more detailed information on how to expose these services. 
From ce207574759e62824b2b259db3182a60d2bdd4c8 Mon Sep 17 00:00:00 2001 From: Nuala Gaffey Date: Mon, 27 Nov 2017 17:04:43 -0500 Subject: [PATCH 161/638] Per issue 593, append .yml to each filename used in the generated configmap. I wasn't able to see alerts in the Prometheus UI until I did this --- hack/scripts/generate-rules-configmap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh index 7a98962d..97f99767 100755 --- a/hack/scripts/generate-rules-configmap.sh +++ b/hack/scripts/generate-rules-configmap.sh @@ -13,6 +13,6 @@ EOF for f in assets/prometheus/rules/*.rules.y*ml do - echo " $(basename $f): |+" + echo " $(basename $f | sed s/\.yml//): |+" cat $f | sed "s/^/ /g" done From d6a2b717d3f0d46178882687a3e16d87955cff7f Mon Sep 17 00:00:00 2001 From: Xabier Larrakoetxea Date: Tue, 28 Nov 2017 14:52:16 +0100 Subject: [PATCH 162/638] Fix cluster:container_cpu_usage:ratio rule on prometheus kubernetes files Signed-off-by: Xabier Larrakoetxea --- assets/prometheus/rules/kubernetes.rules.yaml | 2 +- manifests/prometheus/prometheus-k8s-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml index f13d0088..537079a4 100644 --- a/assets/prometheus/rules/kubernetes.rules.yaml +++ b/assets/prometheus/rules/kubernetes.rules.yaml @@ -25,7 +25,7 @@ groups: expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 / sum(machine_cpu_cores) - record: cluster:container_cpu_usage:ratio - expr: rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]) + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) / sum(machine_cpu_cores) - record: apiserver_latency_seconds:quantile expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / diff --git 
a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 6493ff74..c7d5736e 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -406,7 +406,7 @@ data: expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 / sum(machine_cpu_cores) - record: cluster:container_cpu_usage:ratio - expr: rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]) + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) / sum(machine_cpu_cores) - record: apiserver_latency_seconds:quantile expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / From bf6461b85160828750b5c035a05383dc2c081297 Mon Sep 17 00:00:00 2001 From: Nuala Gaffey Date: Tue, 28 Nov 2017 19:22:17 -0500 Subject: [PATCH 163/638] Wrapping the filename in quotes preserves the .yml suffix, so regex is no longer necessary --- hack/scripts/generate-rules-configmap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh index 97f99767..76ec64ca 100755 --- a/hack/scripts/generate-rules-configmap.sh +++ b/hack/scripts/generate-rules-configmap.sh @@ -13,6 +13,6 @@ EOF for f in assets/prometheus/rules/*.rules.y*ml do - echo " $(basename $f | sed s/\.yml//): |+" + echo " $(basename "$f"): |+" cat $f | sed "s/^/ /g" done From 3afc174fc5a4b6c089579cd4d8dd9e9a5e3df925 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 29 Nov 2017 10:59:44 +0100 Subject: [PATCH 164/638] kube-prometheus: Add Prometheus 2.0 rules --- assets/prometheus/rules/prometheus.rules.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml index df51d010..e27aa281 100644 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ 
b/assets/prometheus/rules/prometheus.rules.yaml @@ -42,3 +42,30 @@ groups: annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers + - alert: PrometheusTSDBReloadsFailing + expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 + for: 12h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + reload failures over the last four hours.' + summary: Prometheus has issues reloading data blocks from disk + - alert: PrometheusTSDBCompactionsFailing + expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 + for: 12h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + compaction failures over the last four hours.' + summary: Prometheus has issues compacting sample blocks + - alert: PrometheusTSDBWALCorruptions + expr: tsdb_wal_corruptions_total > 0 + for: 4h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead + log (WAL).' + summary: Prometheus write-ahead log is corrupted From 64425be6b9cfe503230f6d1bd9a16922b937aa56 Mon Sep 17 00:00:00 2001 From: Peter Fiddes Date: Wed, 29 Nov 2017 11:31:42 +0000 Subject: [PATCH 165/638] contrib: namespace warning to kube-prometheus NAMESPACE feature in scripts implies it works. warnning ensures that users are aware that its use requires further changes. 
Alleviates #765 --- hack/cluster-monitoring/deploy | 3 +++ hack/cluster-monitoring/teardown | 3 +++ 2 files changed, 6 insertions(+) diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 818db48c..a4f7c184 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -4,6 +4,9 @@ if [ -z "${KUBECONFIG}" ]; then export KUBECONFIG=~/.kube/config fi +# CAUTION - setting NAMESPACE will deploy most components to the given namespace +# however some are hardcoded to 'monitoring'. Only use if you have reviewed all manifests. + if [ -z "${NAMESPACE}" ]; then NAMESPACE=monitoring fi diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown index ac4d222d..b2c4c544 100755 --- a/hack/cluster-monitoring/teardown +++ b/hack/cluster-monitoring/teardown @@ -4,6 +4,9 @@ if [ -z "${KUBECONFIG}" ]; then export KUBECONFIG=~/.kube/config fi +# CAUTION - NAMESPACE must match its value when deploy script was run. +# Some resources are always deployed to the monitoring namespace. 
+ if [ -z "${NAMESPACE}" ]; then NAMESPACE=monitoring fi From fb01fe91dc264e7f75dedf02e6ff51c5a2187129 Mon Sep 17 00:00:00 2001 From: Bradley Date: Thu, 7 Dec 2017 18:35:58 +0000 Subject: [PATCH 166/638] Adding requested and limit values to CPU and limit value to memory --- assets/grafana/pods.dashboard.py | 36 +++++++++++++++++++++++ manifests/grafana/grafana-dashboards.yaml | 27 +++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/assets/grafana/pods.dashboard.py b/assets/grafana/pods.dashboard.py index 1003d8a5..cea80f66 100644 --- a/assets/grafana/pods.dashboard.py +++ b/assets/grafana/pods.dashboard.py @@ -132,6 +132,18 @@ dashboard = Dashboard( 'refId': 'B', 'step': 20, }, + { + 'expr': 'kube_pod_container_resource_limits_' + 'memory_bytes{pod="$pod", container=~' + '"$container"}', + 'interval': '10s', + 'intervalFactor': 2, + 'legendFormat': 'Limit: {{ container }}', + 'metric': 'kube_pod_container_resource_' + 'limits_memory_bytes', + 'refId': 'C', + 'step': 20, + }, ], ), ], @@ -170,6 +182,30 @@ dashboard = Dashboard( 'refId': 'A', 'step': 30 }, + { + 'expr': 'kube_pod_container_resource_requests_' + 'cpu_cores{pod="$pod", container=~' + '"$container"}', + 'interval': '10s', + 'intervalFactor': 2, + 'legendFormat': 'Requested: {{ container }}', + 'metric': 'kube_pod_container_resource_' + 'requests_cpu_cores', + 'refId': 'B', + 'step': 20, + }, + { + 'expr': 'kube_pod_container_resource_limits_' + 'cpu_cores{pod="$pod", container=~' + '"$container"}', + 'interval': '10s', + 'intervalFactor': 2, + 'legendFormat': 'Limit: {{ container }}', + 'metric': 'kube_pod_container_resource_' + 'limits_memory_bytes', + 'refId': 'C', + 'step': 20, + }, ], ), ], diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index a55a8df5..3834d748 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -5241,6 +5241,15 @@ data: "metric": 
"kube_pod_container_resource_requests_memory_bytes", "refId": "B", "step": 20 + }, + { + "expr": "kube_pod_container_resource_limits_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "metric": "kube_pod_container_resource_limits_memory_bytes", + "refId": "C", + "step": 20 } ], "title": "Memory Usage", @@ -5327,6 +5336,24 @@ data: "legendFormat": "{{ container_name }}", "refId": "A", "step": 30 + }, + { + "expr": "kube_pod_container_resource_requests_cpu_cores{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_resource_requests_cpu_cores", + "refId": "B", + "step": 20 + }, + { + "expr": "kube_pod_container_resource_limits_cpu_cores{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "metric": "kube_pod_container_resource_limits_memory_bytes", + "refId": "C", + "step": 20 } ], "title": "CPU Usage", From f19cddc429c3f4cfcaf8df62d892ca85608225a5 Mon Sep 17 00:00:00 2001 From: Tyler Roscoe Date: Mon, 18 Dec 2017 19:47:18 -0700 Subject: [PATCH 167/638] Regenerate prometheus-k8s-rules.yaml after change to prometheus.rules.yaml in c4be29f65. 
--- .../prometheus/prometheus-k8s-rules.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index c7d5736e..8890ae49 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -555,3 +555,30 @@ data: annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers + - alert: PrometheusTSDBReloadsFailing + expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 + for: 12h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + reload failures over the last four hours.' + summary: Prometheus has issues reloading data blocks from disk + - alert: PrometheusTSDBCompactionsFailing + expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 + for: 12h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + compaction failures over the last four hours.' + summary: Prometheus has issues compacting sample blocks + - alert: PrometheusTSDBWALCorruptions + expr: tsdb_wal_corruptions_total > 0 + for: 4h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead + log (WAL).' 
+ summary: Prometheus write-ahead log is corrupted From a9fedc6343cf16403e5927cd209130f088f3dd27 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 22 Dec 2017 15:24:43 +0100 Subject: [PATCH 168/638] kube-prometheus: Update etcd3 rules --- assets/prometheus/rules/etcd3.rules.yaml | 10 +++++----- manifests/prometheus/prometheus-k8s-rules.yaml | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/assets/prometheus/rules/etcd3.rules.yaml b/assets/prometheus/rules/etcd3.rules.yaml index ade2ed62..fe879fe3 100644 --- a/assets/prometheus/rules/etcd3.rules.yaml +++ b/assets/prometheus/rules/etcd3.rules.yaml @@ -26,8 +26,8 @@ groups: changes within the last hour summary: a high number of leader changes within the etcd cluster are happening - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) - / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) + / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01 for: 10m labels: severity: warning @@ -36,8 +36,8 @@ groups: on etcd instance {{ $labels.instance }}' summary: a high number of gRPC requests are failing - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) - / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) + / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05 for: 5m labels: severity: critical @@ -46,7 +46,7 @@ groups: on etcd instance {{ $labels.instance }}' summary: a high number of gRPC requests are failing - alert: GRPCRequestsSlow - expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) 
+ expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15 for: 10m labels: diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 8890ae49..0a667e01 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -66,8 +66,8 @@ data: changes within the last hour summary: a high number of leader changes within the etcd cluster are happening - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) - / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) + / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01 for: 10m labels: severity: warning @@ -76,8 +76,8 @@ data: on etcd instance {{ $labels.instance }}' summary: a high number of gRPC requests are failing - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) - / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) + / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05 for: 5m labels: severity: critical @@ -86,7 +86,7 @@ data: on etcd instance {{ $labels.instance }}' summary: a high number of gRPC requests are failing - alert: GRPCRequestsSlow - expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15 for: 10m labels: From 91305360976131f79b16732f95cbe182595bd363 Mon Sep 
17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 22 Dec 2017 16:05:42 +0100 Subject: [PATCH 169/638] *: Add kube-prometheus generate make target to generate stage --- Dockerfile | 6 ------ Makefile | 12 ++++++------ 2 files changed, 6 insertions(+), 12 deletions(-) delete mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 78032788..00000000 --- a/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM python:3.6-slim - -RUN apt-get update -y && apt-get install -y git -RUN pip3 install virtualenv - -ENTRYPOINT ["/bin/bash"] \ No newline at end of file diff --git a/Makefile b/Makefile index a093ce7a..26084ae4 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ .PHONY: image - -IMAGE := coreos/generate-prometheus-operator-manifests -image: Dockerfile - docker build -t $(IMAGE) . +image: + docker build -f ../../scripts/jsonnet/Dockerfile -t po-jsonnet ../../ -BUILDER := docker run --rm -it --workdir /data -v ${PWD}:/data $(IMAGE) ./hack/scripts/generate-manifests.sh generate: image @echo ">> Compiling assets and generating Kubernetes manifests" - $(BUILDER) + docker run --rm -v `pwd`:/go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make generate-raw + +generate-raw: + ./hack/scripts/generate-manifests.sh From 22eef956afd2fabea494a97c10f547861d6997fe Mon Sep 17 00:00:00 2001 From: Giancarlo Rubio Date: Tue, 28 Nov 2017 10:30:42 +0100 Subject: [PATCH 170/638] Add script to keep kube-prometheus rules in sync with helm charts Bump prometheus to 2.0.0, prometheus-operator to 0.15.0, alertmanager to 0.12.0 and node-exporter to 0.15.1, grafana to 4.6.3 migrate prometheus alerts to yaml notation --- assets/prometheus/rules/kube-controller-manager.rules.yaml | 2 +- .../bin/grafana_dashboards_generate.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml 
b/assets/prometheus/rules/kube-controller-manager.rules.yaml index 4ea82ed1..b73456bd 100644 --- a/assets/prometheus/rules/kube-controller-manager.rules.yaml +++ b/assets/prometheus/rules/kube-controller-manager.rules.yaml @@ -10,4 +10,4 @@ groups: description: There is no running K8S controller manager. Deployments and replication controllers are not making progress. runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager - summary: Controller manager is down + summary: Controller manager is down \ No newline at end of file diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index b4273baa..e9d7324a 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -203,7 +203,7 @@ addArrayToConfigMap() { # Dashboard foot test "$type" = "dashboard" && cat $DASHBOARD_FOOT_FILE done - echo "---" + #echo "---" IFS=$OLDIFS return 0 From 68517f63b5e34d6dd497e3a238c63ded8a280b9e Mon Sep 17 00:00:00 2001 From: Giancarlo Rubio Date: Sat, 23 Dec 2017 09:41:12 +0100 Subject: [PATCH 171/638] Delete chart exporter-kube-api because it has been replaced by kube-controller-manager alerts --- assets/prometheus/rules/kube-controller-manager.rules.yaml | 2 +- .../bin/grafana_dashboards_generate.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml index b73456bd..4ea82ed1 100644 --- a/assets/prometheus/rules/kube-controller-manager.rules.yaml +++ b/assets/prometheus/rules/kube-controller-manager.rules.yaml @@ -10,4 +10,4 @@ groups: description: There is no running K8S controller manager. Deployments and replication controllers are not making progress. 
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager - summary: Controller manager is down \ No newline at end of file + summary: Controller manager is down diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index e9d7324a..b4273baa 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -203,7 +203,7 @@ addArrayToConfigMap() { # Dashboard foot test "$type" = "dashboard" && cat $DASHBOARD_FOOT_FILE done - #echo "---" + echo "---" IFS=$OLDIFS return 0 From 4402d451aeacc392794dd6faf3f62486b416414a Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 5 Jan 2018 16:03:04 +0100 Subject: [PATCH 172/638] kube-prometheus: Add RBAC authorization to metrics endpoints --- .../kube-state-metrics-cluster-role.yaml | 10 +++- .../kube-state-metrics-deployment.yaml | 46 +++++++++++++++---- .../kube-state-metrics-service.yaml | 11 +++-- .../node-exporter-cluster-role-binding.yaml | 12 +++++ .../node-exporter-cluster-role.yaml | 13 ++++++ .../node-exporter-daemonset.yaml | 26 +++++++++-- .../node-exporter-service-account.yaml | 4 ++ .../node-exporter/node-exporter-service.yaml | 2 +- ...8s-service-monitor-kube-state-metrics.yaml | 12 ++++- ...eus-k8s-service-monitor-node-exporter.yaml | 6 ++- 10 files changed, 121 insertions(+), 21 deletions(-) create mode 100644 manifests/node-exporter/node-exporter-cluster-role-binding.yaml create mode 100644 manifests/node-exporter/node-exporter-cluster-role.yaml create mode 100644 manifests/node-exporter/node-exporter-service-account.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml index 6ae8db88..30583ac0 100644 --- 
a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml @@ -27,4 +27,12 @@ rules: resources: - cronjobs - jobs - verbs: ["list", "watch"] \ No newline at end of file + verbs: ["list", "watch"] +- apiGroups: ["authentication.k8s.io"] + resources: + - tokenreviews + verbs: ["create"] +- apiGroups: ["authorization.k8s.io"] + resources: + - subjectaccessreviews + verbs: ["create"] \ No newline at end of file diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index ee8526d3..22a84108 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -11,17 +11,43 @@ spec: spec: serviceAccountName: kube-state-metrics containers: - - name: kube-state-metrics - image: quay.io/coreos/kube-state-metrics:v1.0.1 + - name: kube-rbac-proxy-main + image: quay.io/brancz/kube-rbac-proxy:v0.2.0 + args: + - "--secure-listen-address=:8443" + - "--upstream=http://127.0.0.1:8081/" ports: - - name: metrics - containerPort: 8080 - readinessProbe: - httpGet: - path: /healthz - port: 8080 - initialDelaySeconds: 5 - timeoutSeconds: 5 + - name: https-main + containerPort: 8443 + resources: + requests: + memory: 20Mi + cpu: 10m + limits: + memory: 40Mi + cpu: 20m + - name: kube-rbac-proxy-self + image: quay.io/brancz/kube-rbac-proxy:v0.2.0 + args: + - "--secure-listen-address=:9443" + - "--upstream=http://127.0.0.1:8082/" + ports: + - name: https-self + containerPort: 9443 + resources: + requests: + memory: 20Mi + cpu: 10m + limits: + memory: 40Mi + cpu: 20m + - name: kube-state-metrics + image: quay.io/coreos/kube-state-metrics:v1.2.0-rc.0 + args: + - "--host=127.0.0.1" + - "--port=8081" + - "--telemetry-host=127.0.0.1" + - "--telemetry-port=8082" - name: addon-resizer image: gcr.io/google_containers/addon-resizer:1.0 resources: diff 
--git a/manifests/kube-state-metrics/kube-state-metrics-service.yaml b/manifests/kube-state-metrics/kube-state-metrics-service.yaml index 292c4978..b4422685 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-service.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-service.yaml @@ -6,10 +6,15 @@ metadata: k8s-app: kube-state-metrics name: kube-state-metrics spec: + clusterIP: None ports: - - name: http-metrics - port: 8080 - targetPort: metrics + - name: https-main + port: 8443 + targetPort: https-main + protocol: TCP + - name: https-self + port: 9443 + targetPort: https-self protocol: TCP selector: app: kube-state-metrics diff --git a/manifests/node-exporter/node-exporter-cluster-role-binding.yaml b/manifests/node-exporter/node-exporter-cluster-role-binding.yaml new file mode 100644 index 00000000..a5a20508 --- /dev/null +++ b/manifests/node-exporter/node-exporter-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-exporter +subjects: +- kind: ServiceAccount + name: node-exporter + namespace: monitoring diff --git a/manifests/node-exporter/node-exporter-cluster-role.yaml b/manifests/node-exporter/node-exporter-cluster-role.yaml new file mode 100644 index 00000000..932b7762 --- /dev/null +++ b/manifests/node-exporter/node-exporter-cluster-role.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-exporter +rules: +- apiGroups: ["authentication.k8s.io"] + resources: + - tokenreviews + verbs: ["create"] +- apiGroups: ["authorization.k8s.io"] + resources: + - subjectaccessreviews + verbs: ["create"] diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml index d98deee6..701e491f 100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ 
b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -3,24 +3,26 @@ kind: DaemonSet metadata: name: node-exporter spec: + updateStrategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate template: metadata: labels: app: node-exporter name: node-exporter spec: + serviceAccountName: node-exporter hostNetwork: true hostPID: true containers: - image: quay.io/prometheus/node-exporter:v0.15.0 args: + - "--web.listen-address=127.0.0.1:9101" - "--path.procfs=/host/proc" - "--path.sysfs=/host/sys" name: node-exporter - ports: - - containerPort: 9100 - hostPort: 9100 - name: scrape resources: requests: memory: 30Mi @@ -35,6 +37,22 @@ spec: - name: sys readOnly: true mountPath: /host/sys + - name: kube-rbac-proxy + image: quay.io/brancz/kube-rbac-proxy:v0.2.0 + args: + - "--secure-listen-address=:9100" + - "--upstream=http://127.0.0.1:9101/" + ports: + - containerPort: 9100 + hostPort: 9100 + name: https + resources: + requests: + memory: 20Mi + cpu: 10m + limits: + memory: 40Mi + cpu: 20m tolerations: - effect: NoSchedule operator: Exists diff --git a/manifests/node-exporter/node-exporter-service-account.yaml b/manifests/node-exporter/node-exporter-service-account.yaml new file mode 100644 index 00000000..703a2748 --- /dev/null +++ b/manifests/node-exporter/node-exporter-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-exporter diff --git a/manifests/node-exporter/node-exporter-service.yaml b/manifests/node-exporter/node-exporter-service.yaml index 46b1a3fd..8aa37747 100644 --- a/manifests/node-exporter/node-exporter-service.yaml +++ b/manifests/node-exporter/node-exporter-service.yaml @@ -9,7 +9,7 @@ spec: type: ClusterIP clusterIP: None ports: - - name: http-metrics + - name: https port: 9100 protocol: TCP selector: diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml index 6563a4d4..1433a5fe 
100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml @@ -13,6 +13,16 @@ spec: matchNames: - monitoring endpoints: - - port: http-metrics + - port: https-main + scheme: https interval: 30s honorLabels: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + insecureSkipVerify: true + - port: https-self + scheme: https + interval: 30s + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + insecureSkipVerify: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml index e1b083bb..0dd72e75 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml @@ -13,5 +13,9 @@ spec: matchNames: - monitoring endpoints: - - port: http-metrics + - port: https + scheme: https interval: 30s + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + insecureSkipVerify: true From 5392443721e3e0d8a225c8e3c6ad428f5f9efaeb Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 10 Jan 2018 12:27:14 +0100 Subject: [PATCH 173/638] kube-prometheus: Add etcd dashboard --- assets/grafana/.gitignore | 1 - assets/grafana/generated/.gitignore | 2 + assets/grafana/generated/.keep | 0 .../raw-json-dashboards/etcd-dashboard.json | 1158 ++++++++++++++++ hack/scripts/generate-dashboards-configmap.sh | 28 +- hack/scripts/generate-manifests.sh | 3 +- hack/scripts/wrap-dashboard.sh | 14 +- manifests/grafana/grafana-dashboards.yaml | 1172 +++++++++++++++++ 8 files changed, 2368 insertions(+), 10 deletions(-) delete mode 100644 assets/grafana/.gitignore create mode 100644 assets/grafana/generated/.gitignore create mode 100644 assets/grafana/generated/.keep create mode 100644 
assets/grafana/raw-json-dashboards/etcd-dashboard.json diff --git a/assets/grafana/.gitignore b/assets/grafana/.gitignore deleted file mode 100644 index 047d1277..00000000 --- a/assets/grafana/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*-dashboard.json diff --git a/assets/grafana/generated/.gitignore b/assets/grafana/generated/.gitignore new file mode 100644 index 00000000..92063fdc --- /dev/null +++ b/assets/grafana/generated/.gitignore @@ -0,0 +1,2 @@ +*-dashboard.json +*-datasource.json diff --git a/assets/grafana/generated/.keep b/assets/grafana/generated/.keep new file mode 100644 index 00000000..e69de29b diff --git a/assets/grafana/raw-json-dashboards/etcd-dashboard.json b/assets/grafana/raw-json-dashboards/etcd-dashboard.json new file mode 100644 index 00000000..adc7e34f --- /dev/null +++ b/assets/grafana/raw-json-dashboards/etcd-dashboard.json @@ -0,0 +1,1158 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.5.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "etcd sample Grafana dashboard with Prometheus", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + 
"error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 28, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(etcd_server_has_leader)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "etcd_server_has_leader", + "refId": "A", + "step": 20 + } + ], + "thresholds": "", + "title": "Up", + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 0, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RPC Rate", + "metric": "grpc_server_started_total", + "refId": "A", + "step": 4 + }, + 
{ + "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RPC Failed Rate", + "metric": "grpc_server_handled_total", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "RPC Rate", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 0, + "id": 41, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Watch Streams", + "metric": "grpc_server_handled_total", + "refId": "A", + "step": 4 + }, + { + "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "Lease Streams", + "metric": "grpc_server_handled_total", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Active Streams", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "etcd_debugging_mvcc_db_total_size_in_bytes", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} DB Size", + "metric": "", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "DB Size", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}} WAL fsync", + "metric": "etcd_disk_wal_fsync_duration_seconds_bucket", + "refId": "A", + "step": 4 + }, + { + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} DB fsync", + "metric": "etcd_disk_backend_commit_duration_seconds_bucket", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk Sync Duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + 
"logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 0, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Resident Memory", + "metric": "process_resident_memory_bytes", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 5, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Client Traffic In", + "metric": "etcd_network_client_grpc_received_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Client Traffic In", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 5, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Client Traffic Out", + "metric": "etcd_network_client_grpc_sent_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeShift": null, + "title": "Client Traffic Out", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 0, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Peer Traffic In", + "metric": "etcd_network_peer_received_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Peer Traffic In", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} Peer Traffic Out", + "metric": "etcd_network_peer_sent_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Peer Traffic Out", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 0, + "id": 40, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + 
"nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Failure Rate", + "metric": "etcd_server_proposals_failed_total", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(etcd_server_proposals_pending)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Pending Total", + "metric": "etcd_server_proposals_pending", + "refId": "B", + "step": 2 + }, + { + "expr": "sum(rate(etcd_server_proposals_committed_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Commit Rate", + "metric": "etcd_server_proposals_committed_total", + "refId": "C", + "step": 2 + }, + { + "expr": "sum(rate(etcd_server_proposals_applied_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Apply Rate", + "refId": "D", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Raft Proposals", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "editable": true, + "error": false, + "fill": 0, + "id": 19, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + 
"rightSide": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "changes(etcd_server_leader_changes_seen_total[1d])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Total Leader Elections Per Day", + "metric": "etcd_server_leader_changes_seen_total", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Total Leader Elections Per Day", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "etcd", + "version": 4 +} \ No newline at end of file diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh index 5b0ed3e8..61140317 100755 --- 
a/hack/scripts/generate-dashboards-configmap.sh +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -1,5 +1,6 @@ #!/bin/bash set -e +set +x cat <<-EOF apiVersion: v1 @@ -9,23 +10,42 @@ metadata: data: EOF +for f in assets/grafana/generated/*-dashboard.json +do + rm -rf $f +done + +for f in assets/grafana/generated/*-datasource.json +do + rm -rf $f +done + virtualenv -p python3 .env source .env/bin/activate pip install -Ur requirements.txt for f in assets/grafana/*.dashboard.py do - JSON_FILENAME="$(pwd)/${f%%.*}-dashboard.json" + basefilename=$(basename $f) + JSON_FILENAME="assets/grafana/generated/${basefilename%%.*}-dashboard.json" generate-dashboard $f -o $JSON_FILENAME 2>&1 > /dev/null done -for f in assets/grafana/*-dashboard.json +cp assets/grafana/raw-json-dashboards/*-dashboard.json assets/grafana/generated/ + +for f in assets/grafana/generated/*-dashboard.json do - echo " $(basename $f): |+" - hack/scripts/wrap-dashboard.sh $f | sed "s/^/ /g" + basefilename=$(basename $f) + echo " $basefilename: |+" + if [ "$basefilename" = "etcd-dashboard.json" ]; then # string equality; -eq only compares integers + hack/scripts/wrap-dashboard.sh $f prometheus-etcd | sed "s/^/ /g" + else + hack/scripts/wrap-dashboard.sh $f prometheus-k8s | sed "s/^/ /g" + fi done for f in assets/grafana/*-datasource.json do + cp $f assets/grafana/generated/ echo " $(basename $f): |+" cat $f | sed "s/^/ /g" done diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index 7f300dac..4826864c 100755 --- a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -1,5 +1,6 @@ #!/bin/bash set -e +set +x # Generate Alert Rules ConfigMap hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml @@ -14,7 +15,7 @@ hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashbo # grafana deployment output file: manifests/grafana/grafana-deployment.yaml test -f manifests/grafana/grafana-dashboards.yaml && rm -f 
manifests/grafana/grafana-dashboards.yaml test -f manifests/grafana/grafana-deployment.yaml && rm -f manifests/grafana/grafana-deployment.yaml -hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana -o manifests/grafana/grafana-dashboards.yaml -g manifests/grafana/grafana-deployment.yaml +hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana/generated -o manifests/grafana/grafana-dashboards.yaml -g manifests/grafana/grafana-deployment.yaml # Generate Grafana Credentials Secret hack/scripts/generate-grafana-credentials-secret.sh admin admin > manifests/grafana/grafana-credentials.yaml diff --git a/hack/scripts/wrap-dashboard.sh b/hack/scripts/wrap-dashboard.sh index 8eacdf81..02b1c531 100755 --- a/hack/scripts/wrap-dashboard.sh +++ b/hack/scripts/wrap-dashboard.sh @@ -15,12 +15,18 @@ # * Apply new configmap: # kubectl -n monitoring apply -f manifests/grafana/grafana-cm.yaml -if [ "$#" -ne 1 ]; then - echo "Usage: $0 path-to-dashboard.json" +if [ "$#" -ne 2 ]; then + echo "Usage: $0 path-to-dashboard.json grafana-prometheus-datasource-name" exit 1 fi dashboardjson=$1 +datasource_name=$2 +inputname="DS_PROMETHEUS" + +if [ "$datasource_name" = "prometheus-etcd" ]; then # string equality; -eq only compares integers + inputname="DS_PROMETHEUS-ETCD" # assign without a leading $, otherwise the shell tries to run the expansion as a command +fi cat < Date: Thu, 11 Jan 2018 10:56:07 +0100 Subject: [PATCH 174/638] kube-prometheus: Make grafana dashboards non-editable --- assets/grafana/_grafanalib.py | 11 +- .../kubernetes-capacity-planning.dashboard.py | 21 +- .../kubernetes-cluster-health.dashboard.py | 13 +- ...bernetes-control-plane-status.dashboard.py | 14 +- .../kubernetes-resource-requests.dashboard.py | 9 +- assets/grafana/nodes.dashboard.py | 17 +- assets/grafana/pods.dashboard.py | 10 +- .../raw-json-dashboards/etcd-dashboard.json | 26 +- manifests/grafana/grafana-dashboards.yaml | 223 +++++++++--------- 9 files changed, 199 insertions(+), 145 deletions(-) diff --git 
a/assets/grafana/_grafanalib.py b/assets/grafana/_grafanalib.py index f030f101..e1e121e5 100644 --- a/assets/grafana/_grafanalib.py +++ b/assets/grafana/_grafanalib.py @@ -20,18 +20,17 @@ def Dashboard( 'pluginId': 'prometheus', 'pluginName': 'Prometheus' }, - ], rows=rows, graphTooltip=graphTooltip, **optional_args, + ], rows=rows, graphTooltip=graphTooltip, editable=False, **optional_args, ) def Row( - panels, height=None, title='Dashboard Row', showTitle=False, - editable=None + panels, height=None, title='Dashboard Row', showTitle=False ): assert isinstance(height, (type(None), int)) return core.Row( panels=panels, height=height, title=title, showTitle=showTitle, - titleSize='h6', editable=editable, + titleSize='h6', editable=False, ) @@ -61,7 +60,7 @@ def SingleStat( mappingTypes=mappingTypes, targets=targets, mappingType=mappingType, format=format, colors=colors, span=span, postfix=postfix, sparkline=sparkline, prefixFontSize=prefixFontSize, - hideTimeOverride=None, transparent=transparent, + hideTimeOverride=None, transparent=transparent, editable=False, ) @@ -82,7 +81,7 @@ def Graph( return core.Graph( id=id, title=title, dashLength=dashLength, dashes=dashes, spaceLength=spaceLength, targets=targets, xAxis=xAxis, yAxes=yAxes, - dataSource='${DS_PROMETHEUS}', nullPointMode=nullPointMode, + dataSource='${DS_PROMETHEUS}', nullPointMode=nullPointMode, editable=False, ) diff --git a/assets/grafana/kubernetes-capacity-planning.dashboard.py b/assets/grafana/kubernetes-capacity-planning.dashboard.py index bf8762bd..00e5ada2 100644 --- a/assets/grafana/kubernetes-capacity-planning.dashboard.py +++ b/assets/grafana/kubernetes-capacity-planning.dashboard.py @@ -7,6 +7,7 @@ dashboard = Dashboard( gnetId=22, graphTooltip=0, refresh=False, + editable=False, schemaVersion=14, time=Time(start='now-1h'), timezone='browser', @@ -22,7 +23,7 @@ dashboard = Dashboard( ], rows=[ Row( - height=250, title='New Row', showTitle=False, + height=250, title='New Row', showTitle=False, 
editable=False, titleSize='h6', panels=[ Graph( title='Idle CPU', @@ -31,6 +32,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, spaceLength=10, span=6, tooltip=Tooltip(msResolution=False), @@ -57,6 +59,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, spaceLength=10, span=6, tooltip=Tooltip(msResolution=False), @@ -94,7 +97,7 @@ dashboard = Dashboard( ], ), Row( - height=250, title='New Row', showTitle=False, + height=250, title='New Row', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='Memory Usage', @@ -103,6 +106,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, spaceLength=10, span=9, stack=True, @@ -171,6 +175,7 @@ dashboard = Dashboard( format='percent', span=3, gauge=Gauge(show=True), + editable=False, thresholds='80, 90', valueMaps=[ { @@ -203,7 +208,7 @@ dashboard = Dashboard( ], ), Row( - height=246, title='New Row', showTitle=False, + height=246, title='New Row', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='Disk I/O', @@ -212,6 +217,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, spaceLength=10, span=9, tooltip=Tooltip(msResolution=False), @@ -264,6 +270,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', id=12, span=3, + editable=False, format='percentunit', valueName='current', gauge=Gauge( @@ -301,7 +308,7 @@ dashboard = Dashboard( ] ), Row( - height=250, title='New Row', showTitle=False, + height=250, title='New Row', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='Network Received', @@ -310,6 +317,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, spaceLength=10, span=6, tooltip=Tooltip(msResolution=False), @@ -343,6 +351,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, spaceLength=10, span=6, tooltip=Tooltip(msResolution=False), @@ -372,7 
+381,7 @@ dashboard = Dashboard( ], ), Row( - height=276, title='New Row', showTitle=False, + height=276, title='New Row', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( @@ -381,6 +390,7 @@ dashboard = Dashboard( id=11, span=9, dashes=False, + editable=False, spaceLength=11, tooltip=Tooltip( msResolution=False, @@ -413,6 +423,7 @@ dashboard = Dashboard( title='Pod Utilization', dataSource='${DS_PROMETHEUS}', id=7, + editable=False, span=3, format='percent', rangeMaps=[ diff --git a/assets/grafana/kubernetes-cluster-health.dashboard.py b/assets/grafana/kubernetes-cluster-health.dashboard.py index e7296f67..dbd402f3 100644 --- a/assets/grafana/kubernetes-cluster-health.dashboard.py +++ b/assets/grafana/kubernetes-cluster-health.dashboard.py @@ -6,6 +6,7 @@ dashboard = Dashboard( version=9, graphTooltip=0, schemaVersion=14, + editable=False, time=Time(start='now-6h'), timezone='browser', inputs=[ @@ -20,7 +21,7 @@ dashboard = Dashboard( ], rows=[ Row( - height=254, title='Row', showTitle=False, + height=254, title='Row', showTitle=False, editable=False, titleSize='h6', panels=[ SingleStat( title='Control Plane Components Down', @@ -30,6 +31,7 @@ dashboard = Dashboard( span=3, thresholds='1, 3', colorValue=True, + editable=False, rangeMaps=[ { 'from': 'null', @@ -77,6 +79,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', gauge=Gauge(), colorValue=True, + editable=False, span=3, valueName='current', thresholds='1, 3', @@ -122,6 +125,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', gauge=Gauge(), colorValue=True, + editable=False, span=3, valueName='current', thresholds='3, 5', @@ -167,6 +171,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', gauge=Gauge(), colorValue=True, + editable=False, span=3, valueName='current', thresholds='1, 3', @@ -209,7 +214,7 @@ dashboard = Dashboard( ], ), Row( - height=250, title='Row', showTitle=False, + height=250, title='Row', showTitle=False, editable=False, titleSize='h6', panels=[ 
SingleStat( title='Node Not Ready', @@ -217,6 +222,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', gauge=Gauge(), colorValue=True, + editable=False, span=3, valueName='current', thresholds='1, 3', @@ -262,6 +268,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', gauge=Gauge(), colorValue=True, + editable=False, span=3, valueName='current', thresholds='1, 3', @@ -307,6 +314,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', gauge=Gauge(), colorValue=True, + editable=False, span=3, valueName='current', thresholds='1, 3', @@ -352,6 +360,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', gauge=Gauge(), colorValue=True, + editable=False, span=3, valueName='current', thresholds='1, 3', diff --git a/assets/grafana/kubernetes-control-plane-status.dashboard.py b/assets/grafana/kubernetes-control-plane-status.dashboard.py index b9020675..fad157f5 100644 --- a/assets/grafana/kubernetes-control-plane-status.dashboard.py +++ b/assets/grafana/kubernetes-control-plane-status.dashboard.py @@ -8,6 +8,7 @@ dashboard = Dashboard( time=Time(start='now-6h'), timezone='browser', refresh=None, + editable=False, inputs=[ { 'name': 'DS_PROMETHEUS', @@ -20,12 +21,13 @@ dashboard = Dashboard( ], rows=[ Row( - title='Dashboard Row', showTitle=False, titleSize='h6', + title='Dashboard Row', showTitle=False, titleSize='h6', editable=False, panels=[ SingleStat( title='API Servers UP', dataSource='${DS_PROMETHEUS}', format='percent', + editable=False, gauge=Gauge( show=True, ), @@ -76,6 +78,7 @@ dashboard = Dashboard( title='Controller Managers UP', dataSource='${DS_PROMETHEUS}', format='percent', + editable=False, gauge=Gauge( show=True, ), @@ -127,6 +130,7 @@ dashboard = Dashboard( title='Schedulers UP', dataSource='${DS_PROMETHEUS}', format='percent', + editable=False, gauge=Gauge( show=True, ), @@ -177,6 +181,7 @@ dashboard = Dashboard( title='API Server Request Error Rate', dataSource='${DS_PROMETHEUS}', format='percent', + editable=False, gauge=Gauge( 
show=True, ), @@ -224,7 +229,7 @@ dashboard = Dashboard( ], ), Row( - title='Dashboard Row', showTitle=False, titleSize='h6', + title='Dashboard Row', showTitle=False, titleSize='h6', editable=False, panels=[ Graph( title='API Server Request Latency', @@ -233,6 +238,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, lineWidth=1, nullPointMode='null', tooltip=Tooltip( @@ -258,13 +264,14 @@ dashboard = Dashboard( ], ), Row( - title='Dashboard Row', showTitle=False, titleSize='h6', + title='Dashboard Row', showTitle=False, titleSize='h6', editable=False, panels=[ Graph( title='End to End Scheduling Latency', id=5, dataSource='${DS_PROMETHEUS}', isNew=False, + editable=False, dashLength=10, lineWidth=1, nullPointMode="null", @@ -295,6 +302,7 @@ dashboard = Dashboard( id=6, dataSource='${DS_PROMETHEUS}', isNew=False, + editable=False, dashLength=10, lineWidth=1, nullPointMode="null", diff --git a/assets/grafana/kubernetes-resource-requests.dashboard.py b/assets/grafana/kubernetes-resource-requests.dashboard.py index 3b3b6157..6e4c510c 100644 --- a/assets/grafana/kubernetes-resource-requests.dashboard.py +++ b/assets/grafana/kubernetes-resource-requests.dashboard.py @@ -6,6 +6,7 @@ dashboard = Dashboard( version=2, graphTooltip=0, refresh=False, + editable=False, schemaVersion=14, time=Time(start='now-3h'), timezone='browser', @@ -21,7 +22,7 @@ dashboard = Dashboard( ], rows=[ Row( - height=300, title='CPU Cores', showTitle=False, + height=300, title='CPU Cores', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='CPU Cores', @@ -37,6 +38,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, lineWidth=1, spaceLength=10, nullPointMode='null', @@ -74,6 +76,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', id=2, format='percent', + editable=False, span=3, gauge=Gauge(show=True), sparkline=SparkLine(show=True), @@ -109,7 +112,7 @@ dashboard = Dashboard( ], ), Row( - 
height=300, title='Memory', showTitle=False, + height=300, title='Memory', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='Memory', @@ -126,6 +129,7 @@ dashboard = Dashboard( dashes=False, lineWidth=1, isNew=False, + editable=False, spaceLength=10, span=9, nullPointMode='null', @@ -165,6 +169,7 @@ dashboard = Dashboard( span=3, gauge=Gauge(show=True), sparkline=SparkLine(show=True), + editable=False, valueFontSize='110%', thresholds='80, 90', valueMaps=[ diff --git a/assets/grafana/nodes.dashboard.py b/assets/grafana/nodes.dashboard.py index f2e7b18e..89d2b1f9 100644 --- a/assets/grafana/nodes.dashboard.py +++ b/assets/grafana/nodes.dashboard.py @@ -8,6 +8,7 @@ dashboard = Dashboard( gnetId=22, graphTooltip=0, refresh=False, + editable=False, schemaVersion=14, time=Time(start='now-1h'), timezone='browser', @@ -45,13 +46,14 @@ dashboard = Dashboard( ]), rows=[ Row( - height=250, title='New Row', showTitle=False, + height=250, title='New Row', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='Idle CPU', dataSource='${DS_PROMETHEUS}', id=3, isNew=False, + editable=False, spaceLength=10, span=6, dashLength=10, @@ -83,6 +85,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', id=9, isNew=False, + editable=False, spaceLength=10, span=6, dashLength=10, @@ -122,13 +125,14 @@ dashboard = Dashboard( ], ), Row( - height=250, title='New Row', showTitle=False, + height=250, title='New Row', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='Memory Usage', dataSource='${DS_PROMETHEUS}', id=4, isNew=False, + editable=False, spaceLength=10, span=9, stack=True, @@ -198,6 +202,7 @@ dashboard = Dashboard( id=5, format='percent', gauge=Gauge(show=True), + editable=False, span=3, rangeMaps=[ { @@ -232,7 +237,7 @@ dashboard = Dashboard( ], ), Row( - height=250, titleSize='h6', title='New Row', + height=250, titleSize='h6', title='New Row', editable=False, showTitle=False, panels=[ Graph( title='Disk I/O', @@ -240,6 
+245,7 @@ dashboard = Dashboard( id=6, dashLength=10, dashes=False, + editable=False, spaceLength=10, span=9, tooltip=Tooltip(msResolution=False), @@ -301,6 +307,7 @@ dashboard = Dashboard( dataSource='${DS_PROMETHEUS}', id=7, thresholds='0.75, 0.9', + editable=False, valueName='current', format='percentunit', span=3, @@ -340,7 +347,7 @@ dashboard = Dashboard( ), Row( height=250, title='New Row', titleSize='h6', - showTitle=False, + showTitle=False, editable=False, panels=[ Graph( title='Network Received', @@ -349,6 +356,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, spaceLength=10, span=6, tooltip=Tooltip(msResolution=False), @@ -382,6 +390,7 @@ dashboard = Dashboard( dashLength=10, dashes=False, isNew=False, + editable=False, spaceLength=10, span=6, tooltip=Tooltip(msResolution=False), diff --git a/assets/grafana/pods.dashboard.py b/assets/grafana/pods.dashboard.py index cea80f66..f5258cbc 100644 --- a/assets/grafana/pods.dashboard.py +++ b/assets/grafana/pods.dashboard.py @@ -6,6 +6,7 @@ dashboard = Dashboard( version=1, graphTooltip=1, refresh=False, + editable=False, schemaVersion=14, time=Time(start='now-6h'), timezone='browser', @@ -85,13 +86,14 @@ dashboard = Dashboard( ]), rows=[ Row( - height=250, title='Row', showTitle=False, + height=250, title='Row', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='Memory Usage', dataSource='${DS_PROMETHEUS}', id=1, isNew=False, + editable=False, spaceLength=10, span=12, dashLength=10, @@ -149,13 +151,14 @@ dashboard = Dashboard( ], ), Row( - height=250, title='Row', showTitle=False, + height=250, title='Row', showTitle=False, editable=False, titleSize='h6', panels=[ Graph( title='CPU Usage', dataSource='${DS_PROMETHEUS}', id=2, isNew=False, + editable=False, spaceLength=10, span=12, dashLength=10, @@ -211,13 +214,14 @@ dashboard = Dashboard( ], ), Row( - height=250, title='New Row', showTitle=False, + height=250, title='New Row', showTitle=False, 
editable=False, titleSize='h6', panels=[ Graph( title='Network I/O', dataSource='${DS_PROMETHEUS}', id=3, isNew=False, + editable=False, spaceLength=10, span=12, dashLength=10, diff --git a/assets/grafana/raw-json-dashboards/etcd-dashboard.json b/assets/grafana/raw-json-dashboards/etcd-dashboard.json index adc7e34f..0098ffea 100644 --- a/assets/grafana/raw-json-dashboards/etcd-dashboard.json +++ b/assets/grafana/raw-json-dashboards/etcd-dashboard.json @@ -39,7 +39,7 @@ "list": [] }, "description": "etcd sample Grafana dashboard with Prometheus", - "editable": true, + "editable": false, "gnetId": null, "graphTooltip": 0, "hideControls": false, @@ -61,7 +61,7 @@ "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "format": "none", "gauge": { @@ -137,7 +137,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 23, @@ -226,7 +226,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 41, @@ -328,7 +328,7 @@ "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": null, - "editable": true, + "editable": false, "error": false, "fill": 0, "grid": {}, @@ -409,7 +409,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "grid": {}, @@ -498,7 +498,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 29, @@ -590,7 +590,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 5, "id": 22, @@ -670,7 +670,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 5, "id": 21, @@ 
-750,7 +750,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 20, @@ -831,7 +831,7 @@ "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": null, - "editable": true, + "editable": false, "error": false, "fill": 0, "grid": {}, @@ -924,7 +924,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 40, @@ -1031,7 +1031,7 @@ "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 19, diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 3d6e4410..82428564 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -20,13 +20,14 @@ data: "annotations": { "list": [] }, - "editable": true, + "editable": false, "graphTooltip": 1, "hideControls": false, "links": [], "rows": [ { "collapse": false, + "editable": false, "height": "200px", "panels": [ { @@ -38,7 +39,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -109,7 +110,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -180,7 +181,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "Bps", "gauge": { "maxValue": 100, @@ -249,6 +250,7 @@ data: }, { "collapse": false, + "editable": false, "height": "100px", "panels": [ { @@ -260,7 +262,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -331,7 +333,7 @@ data: "rgba(50, 172, 45, 0.97)" ], 
"datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -401,7 +403,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -471,7 +473,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -539,6 +541,7 @@ data: }, { "collapse": false, + "editable": false, "height": "350px", "panels": [ { @@ -547,7 +550,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -788,7 +791,7 @@ data: "list": [] }, "description": "etcd sample Grafana dashboard with Prometheus", - "editable": true, + "editable": false, "gnetId": null, "graphTooltip": 0, "hideControls": false, @@ -810,7 +813,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "format": "none", "gauge": { @@ -886,7 +889,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 23, @@ -975,7 +978,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 41, @@ -1077,7 +1080,7 @@ data: "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": null, - "editable": true, + "editable": false, "error": false, "fill": 0, "grid": {}, @@ -1158,7 +1161,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "grid": {}, @@ -1247,7 +1250,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 
29, @@ -1339,7 +1342,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 5, "id": 22, @@ -1419,7 +1422,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 5, "id": 21, @@ -1499,7 +1502,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 20, @@ -1580,7 +1583,7 @@ data: "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": null, - "editable": true, + "editable": false, "error": false, "fill": 0, "grid": {}, @@ -1673,7 +1676,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 40, @@ -1780,7 +1783,7 @@ data: "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, - "editable": true, + "editable": false, "error": false, "fill": 0, "id": 19, @@ -1933,7 +1936,7 @@ data: "annotations": { "list": [] }, - "editable": true, + "editable": false, "gnetId": 22, "graphTooltip": 0, "hideControls": false, @@ -1942,7 +1945,7 @@ data: "rows": [ { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -1951,7 +1954,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -2029,7 +2032,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -2122,7 +2125,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -2131,7 +2134,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ 
-2247,7 +2250,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -2321,7 +2324,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "246px", "panels": [ { @@ -2330,7 +2333,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -2437,7 +2440,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percentunit", "gauge": { "maxValue": 1, @@ -2510,7 +2513,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -2519,7 +2522,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -2601,7 +2604,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -2684,7 +2687,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "276px", "panels": [ { @@ -2692,7 +2695,7 @@ data: "bars": false, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -2779,7 +2782,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -2921,7 +2924,7 @@ data: "annotations": { "list": [] }, - "editable": true, + "editable": false, "graphTooltip": 0, "hideControls": false, "links": [], @@ -2929,7 +2932,7 @@ data: "rows": [ { "collapse": false, - "editable": true, + "editable": false, "height": "254px", "panels": [ { @@ -2941,7 +2944,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - 
"editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3022,7 +3025,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3098,7 +3101,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3174,7 +3177,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3248,7 +3251,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -3260,7 +3263,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3336,7 +3339,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3412,7 +3415,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3488,7 +3491,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3629,13 +3632,14 @@ data: "annotations": { "list": [] }, - "editable": true, + "editable": false, "graphTooltip": 0, "hideControls": false, "links": [], "rows": [ { "collapse": false, + "editable": false, "height": "129px", "panels": [ { @@ -3647,7 +3651,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -3719,7 +3723,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, 
"format": "none", "gauge": { "maxValue": 100, @@ -3789,6 +3793,7 @@ data: }, { "collapse": false, + "editable": false, "height": "168px", "panels": [ { @@ -3800,7 +3805,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -3872,7 +3877,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -3944,7 +3949,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4016,7 +4021,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -4086,6 +4091,7 @@ data: }, { "collapse": false, + "editable": false, "height": "158px", "panels": [ { @@ -4097,7 +4103,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4169,7 +4175,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4241,7 +4247,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4313,7 +4319,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4450,14 +4456,14 @@ data: "annotations": { "list": [] }, - "editable": true, + "editable": false, "graphTooltip": 0, "hideControls": false, "links": [], "rows": [ { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -4469,7 +4475,7 @@ data: "rgba(50, 
172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4544,7 +4550,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4619,7 +4625,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4694,7 +4700,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -4768,7 +4774,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -4777,7 +4783,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -4854,7 +4860,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -4863,7 +4869,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -4938,7 +4944,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -5090,7 +5096,7 @@ data: "annotations": { "list": [] }, - "editable": true, + "editable": false, "graphTooltip": 0, "hideControls": false, "links": [], @@ -5098,7 +5104,7 @@ data: "rows": [ { "collapse": false, - "editable": true, + "editable": false, "height": "300px", "panels": [ { @@ -5108,7 +5114,7 @@ data: "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in 
the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -5196,7 +5202,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -5269,7 +5275,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "300px", "panels": [ { @@ -5279,7 +5285,7 @@ data: "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -5367,7 +5373,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -5508,7 +5514,7 @@ data: "list": [] }, "description": "Dashboard to get an overview of one server", - "editable": true, + "editable": false, "gnetId": 22, "graphTooltip": 0, "hideControls": false, @@ -5517,7 +5523,7 @@ data: "rows": [ { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -5526,7 +5532,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -5605,7 +5611,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -5698,7 +5704,7 @@ data: }, { "collapse": false, - 
"editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -5707,7 +5713,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -5819,7 +5825,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percent", "gauge": { "maxValue": 100, @@ -5892,7 +5898,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -5901,7 +5907,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -6008,7 +6014,7 @@ data: "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "percentunit", "gauge": { "maxValue": 1, @@ -6081,7 +6087,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -6090,7 +6096,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -6172,7 +6178,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -6343,7 +6349,7 @@ data: "annotations": { "list": [] }, - "editable": true, + "editable": false, "graphTooltip": 1, "hideControls": false, "links": [], @@ -6351,7 +6357,7 @@ data: "rows": [ { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -6360,7 +6366,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -6457,7 +6463,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -6466,7 +6472,7 @@ 
data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -6561,7 +6567,7 @@ data: }, { "collapse": false, - "editable": true, + "editable": false, "height": "250px", "panels": [ { @@ -6570,7 +6576,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { @@ -6775,13 +6781,14 @@ data: "annotations": { "list": [] }, - "editable": true, + "editable": false, "graphTooltip": 1, "hideControls": false, "links": [], "rows": [ { "collapse": false, + "editable": false, "height": "200px", "panels": [ { @@ -6793,7 +6800,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -6864,7 +6871,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -6935,7 +6942,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "Bps", "gauge": { "maxValue": 100, @@ -7004,6 +7011,7 @@ data: }, { "collapse": false, + "editable": false, "height": "100px", "panels": [ { @@ -7015,7 +7023,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -7086,7 +7094,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -7156,7 +7164,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "format": "none", "gauge": { "maxValue": 100, @@ -7226,7 +7234,7 @@ data: "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": 
false, "format": "none", "gauge": { "maxValue": 100, @@ -7294,6 +7302,7 @@ data: }, { "collapse": false, + "editable": false, "height": "350px", "panels": [ { @@ -7302,7 +7311,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", - "editable": true, + "editable": false, "error": false, "fill": 1, "grid": { From d379175bb28cfbda5353c37eb06afb2489958f9e Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 11 Jan 2018 20:20:22 +0100 Subject: [PATCH 175/638] *: bump version to v0.16.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index cafc2064..db027925 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.15.0 + image: quay.io/coreos/prometheus-operator:v0.16.0 name: prometheus-operator ports: - containerPort: 8080 From 4d4323dce57445361a83193ad33ed7302bd43ad6 Mon Sep 17 00:00:00 2001 From: SilverFox Date: Sat, 13 Jan 2018 08:44:53 +0800 Subject: [PATCH 176/638] kube-prometheus: Add missing RBAC rules to kube-state-metrics --- .../kube-state-metrics-cluster-role.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml index 30583ac0..ef5e91ac 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml @@ -12,6 +12,9 @@ rules: - replicationcontrollers - limitranges - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints verbs: ["list", "watch"] - 
apiGroups: ["extensions"] resources: @@ -28,6 +31,10 @@ rules: - cronjobs - jobs verbs: ["list", "watch"] +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] - apiGroups: ["authentication.k8s.io"] resources: - tokenreviews From 85f384876e25e95bc1745a695e6b715d2551beee Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 13 Jan 2018 11:39:31 +0100 Subject: [PATCH 177/638] Update kube-state-metrics rules to 1.2 (#884) * Update kube-state-metrics rules to 1.2 * Run make generate to update all manifests * Fix the helm chart kube-state-metrics rules --- assets/prometheus/rules/kube-state-metrics.rules.yaml | 10 +++++++--- manifests/prometheus/prometheus-k8s-rules.yaml | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/assets/prometheus/rules/kube-state-metrics.rules.yaml b/assets/prometheus/rules/kube-state-metrics.rules.yaml index 32b99fa2..9325df0b 100644 --- a/assets/prometheus/rules/kube-state-metrics.rules.yaml +++ b/assets/prometheus/rules/kube-state-metrics.rules.yaml @@ -8,7 +8,8 @@ groups: severity: warning annotations: description: Observed deployment generation does not match expected one for - deployment {{$labels.namespaces}}{{$labels.deployment}} + deployment {{$labels.namespaces}}/{{$labels.deployment}} + summary: Deployment is outdated - alert: DeploymentReplicasNotUpdated expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) @@ -18,8 +19,9 @@ groups: severity: warning annotations: description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + summary: Deployment replicas are outdated - alert: DaemonSetRolloutStuck - expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled + expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 
for: 15m labels: @@ -27,6 +29,7 @@ groups: annotations: description: Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespaces}}/{{$labels.daemonset}} + summary: DaemonSet is missing pods - alert: K8SDaemonSetsNotScheduled expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 @@ -46,10 +49,11 @@ groups: to run. summary: Daemonsets are not scheduled correctly - alert: PodFrequentlyRestarting - expr: increase(kube_pod_container_status_restarts[1h]) > 5 + expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 for: 10m labels: severity: warning annotations: description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}} times within the last hour + summary: Pod is restarting frequently diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 0a667e01..7011423b 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -285,7 +285,8 @@ data: severity: warning annotations: description: Observed deployment generation does not match expected one for - deployment {{$labels.namespaces}}{{$labels.deployment}} + deployment {{$labels.namespaces}}/{{$labels.deployment}} + summary: Deployment is outdated - alert: DeploymentReplicasNotUpdated expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) @@ -295,8 +296,9 @@ data: severity: warning annotations: description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + summary: Deployment replicas are outdated - alert: DaemonSetRolloutStuck - expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled + expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 for: 15m 
labels: @@ -304,6 +306,7 @@ data: annotations: description: Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespaces}}/{{$labels.daemonset}} + summary: DaemonSet is missing pods - alert: K8SDaemonSetsNotScheduled expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 @@ -323,13 +326,14 @@ data: to run. summary: Daemonsets are not scheduled correctly - alert: PodFrequentlyRestarting - expr: increase(kube_pod_container_status_restarts[1h]) > 5 + expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 for: 10m labels: severity: warning annotations: description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}} times within the last hour + summary: Pod is restarting frequently kubelet.rules.yaml: |+ groups: - name: kubelet.rules From aacc95b74cd42127f258ee31cdfadf12a27008b6 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 11 Jan 2018 15:18:12 +0100 Subject: [PATCH 178/638] kube-prometheus: bump various versions --- assets/prometheus/rules/etcd3.rules.yaml | 2 +- .../templates/grafana-deployment-template.yaml | 2 +- manifests/alertmanager/alertmanager.yaml | 2 +- manifests/grafana/grafana-deployment.yaml | 2 +- manifests/kube-state-metrics/kube-state-metrics-deployment.yaml | 2 +- manifests/node-exporter/node-exporter-daemonset.yaml | 2 +- manifests/prometheus/prometheus-k8s-rules.yaml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/assets/prometheus/rules/etcd3.rules.yaml b/assets/prometheus/rules/etcd3.rules.yaml index fe879fe3..a16bf016 100644 --- a/assets/prometheus/rules/etcd3.rules.yaml +++ b/assets/prometheus/rules/etcd3.rules.yaml @@ -86,7 +86,7 @@ groups: }} are slow summary: slow HTTP requests - alert: EtcdMemberCommunicationSlow - expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 
0.15 for: 10m labels: diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml index b6b0f7ec..8a7b8c02 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: grafana - image: grafana/grafana:4.5.2 + image: grafana/grafana:4.6.3 env: - name: GF_AUTH_BASIC_ENABLED value: "true" diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index 22259ef4..00651044 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -6,4 +6,4 @@ metadata: alertmanager: main spec: replicas: 3 - version: v0.9.1 + version: v0.13.0 diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 4907dceb..29dd9022 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: grafana - image: grafana/grafana:4.5.2 + image: grafana/grafana:4.6.3 env: - name: GF_AUTH_BASIC_ENABLED value: "true" diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index 22a84108..6a79f02b 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -42,7 +42,7 @@ spec: memory: 40Mi cpu: 20m - name: kube-state-metrics - image: quay.io/coreos/kube-state-metrics:v1.2.0-rc.0 + image: quay.io/coreos/kube-state-metrics:v1.2.0 args: - "--host=127.0.0.1" - "--port=8081" diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml index 701e491f..250398bd 
100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -17,7 +17,7 @@ spec: hostNetwork: true hostPID: true containers: - - image: quay.io/prometheus/node-exporter:v0.15.0 + - image: quay.io/prometheus/node-exporter:v0.15.2 args: - "--web.listen-address=127.0.0.1:9101" - "--path.procfs=/host/proc" diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 0a667e01..5235ff2b 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -126,7 +126,7 @@ data: }} are slow summary: slow HTTP requests - alert: EtcdMemberCommunicationSlow - expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15 for: 10m labels: From 28a022e5c97836474bdfaf31a18c738660015c50 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 16 Jan 2018 10:19:15 +0100 Subject: [PATCH 179/638] *: bump versions to v0.16.1 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index db027925..b773021c 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.16.0 + image: quay.io/coreos/prometheus-operator:v0.16.1 name: prometheus-operator ports: - containerPort: 8080 From bcb0ba997423073238f987e9535b0849579a1cb2 Mon Sep 17 00:00:00 2001 From: Antoine Legrand <2t.antoine@gmail.com> Date: Thu, 14 Dec 2017 17:13:50 +0100 Subject: [PATCH 180/638] Add cert 
expiration rules --- assets/prometheus/rules/kubernetes.rules.yaml | 14 ++++++++++++++ manifests/prometheus/prometheus-k8s-rules.yaml | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml index 537079a4..f961ce6b 100644 --- a/assets/prometheus/rules/kubernetes.rules.yaml +++ b/assets/prometheus/rules/kubernetes.rules.yaml @@ -84,3 +84,17 @@ groups: annotations: description: No API servers are reachable or all have disappeared from service discovery + + - alert: K8sCertificateExpirationNotice + labels: + severity: warning + annotations: + description: Kubernetes API Certificate is expiring soon (less than 7 days) + expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 + + - alert: K8sCertificateExpirationNotice + labels: + severity: critical + annotations: + description: Kubernetes API Certificate is expiring in less than 1 day + expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index b844d160..d563a571 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -469,6 +469,20 @@ data: annotations: description: No API servers are reachable or all have disappeared from service discovery + + - alert: K8sCertificateExpirationNotice + labels: + severity: warning + annotations: + description: Kubernetes API Certificate is expiring soon (less than 7 days) + expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 + + - alert: K8sCertificateExpirationNotice + labels: + severity: critical + annotations: + description: Kubernetes API Certificate is expiring in less than 1 day + expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 node.rules.yaml: |+ groups: - name: node.rules From 
7bac2a97d4e38c33b5e8afa6326659dfe25fbd21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luis=20Nu=C3=B1ez?= Date: Wed, 17 Jan 2018 14:32:29 +0100 Subject: [PATCH 181/638] add permissions also on k8s manifest --- manifests/prometheus/prometheus-k8s-roles.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests/prometheus/prometheus-k8s-roles.yaml index 14302ea0..4f738e77 100644 --- a/manifests/prometheus/prometheus-k8s-roles.yaml +++ b/manifests/prometheus/prometheus-k8s-roles.yaml @@ -47,5 +47,9 @@ kind: ClusterRole metadata: name: prometheus-k8s rules: +- apiGroups: [""] + resources: + - nodes/metrics + verbs: ["get"] - nonResourceURLs: ["/metrics"] verbs: ["get"] From f7ee14685ac2c4abdfbcb78b7c1da611798a6c88 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 8 Jan 2018 17:20:45 +0100 Subject: [PATCH 182/638] kube-prometheus: Use secure kubelet metrics endpoints --- .../prometheus-k8s-service-monitor-kubelet.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index 60ddc0c4..16c9752d 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -7,11 +7,20 @@ metadata: spec: jobLabel: k8s-app endpoints: - - port: http-metrics + - port: https-metrics + scheme: https interval: 30s - - port: cadvisor + tlsConfig: + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + - port: https-metrics + scheme: https + path: /metrics/cadvisor interval: 30s honorLabels: true + tlsConfig: + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token selector: matchLabels: k8s-app: kubelet From f97b6af095b040f6d0cb4bdba818025f68b085c4 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: 
Tue, 9 Jan 2018 22:32:21 +0100 Subject: [PATCH 183/638] *: Adapt documentation and scripts to use minikube kubelet authN/authZ --- README.md | 12 ++++++++ hack/cluster-monitoring/minikube-deploy | 13 +++++++- .../k8s/minikube/kube-controller-manager.yaml | 30 ------------------- manifests/k8s/minikube/kube-scheduler.yaml | 30 ------------------- 4 files changed, 24 insertions(+), 61 deletions(-) delete mode 100644 manifests/k8s/minikube/kube-controller-manager.yaml delete mode 100644 manifests/k8s/minikube/kube-scheduler.yaml diff --git a/README.md b/README.md index 804dc942..4ada050d 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,18 @@ CoreOS' Tectonic technology. Otherwise, you can simply make use of repository are adapted to work with a [multi-node setup](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node) using [bootkube](https://github.com/kubernetes-incubator/bootkube). + +> We assume that the kubelet uses token authN and authZ, as otherwise +> Prometheus needs a client certificate, which gives it full access to the +> kubelet, rather than just the metrics. Token authN and authZ allows more fine +> grained and easier access control. Simply start minikube with the following +> command (you can of course adapt the version and memory to your needs): +> +> $ minikube delete && minikube start --kubernetes-version=v1.9.1 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +> +> In future versions of minikube and kubeadm this will be the default, but for +> the time being, we will have to configure it ourselves. 
+ ## Monitoring Kubernetes The manifests here use the [Prometheus Operator](https://github.com/coreos/prometheus-operator), diff --git a/hack/cluster-monitoring/minikube-deploy b/hack/cluster-monitoring/minikube-deploy index ab7e72e4..64cb86be 100755 --- a/hack/cluster-monitoring/minikube-deploy +++ b/hack/cluster-monitoring/minikube-deploy @@ -1,6 +1,17 @@ #!/usr/bin/env bash +# We assume that the kubelet uses token authN and authZ, as otherwise +# Prometheus needs a client certificate, which gives it full access to the +# kubelet, rather than just the metrics. Token authN and authZ allows more fine +# grained and easier access control. Simply start minikube with the following +# command (you can of course adapt the version and memory to your needs): +# +# $ minikube delete && minikube start --kubernetes-version=v1.9.1 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +# +# In future versions of minikube and kubeadm this will be the default, but for +# the time being, we will have to configure it ourselves. 
+ hack/cluster-monitoring/deploy -awk 'FNR==1{print "---"}1' manifests/k8s/minikube/*.yaml | sed s/MINIKUBE_IP/`minikube ip`/g | kubectl --namespace=kube-system apply -f - +kubectl --namespace=kube-system apply -f manifests/k8s/kubeadm/ diff --git a/manifests/k8s/minikube/kube-controller-manager.yaml b/manifests/k8s/minikube/kube-controller-manager.yaml deleted file mode 100644 index d33015aa..00000000 --- a/manifests/k8s/minikube/kube-controller-manager.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - namespace: kube-system - name: kube-controller-manager-prometheus-discovery - labels: - k8s-app: kube-controller-manager -spec: - type: ClusterIP - clusterIP: None - ports: - - name: http-metrics - port: 10252 - targetPort: 10252 - protocol: TCP ---- -apiVersion: v1 -kind: Endpoints -metadata: - namespace: kube-system - name: kube-controller-manager-prometheus-discovery - labels: - k8s-app: kube-controller-manager -subsets: -- addresses: - - ip: MINIKUBE_IP - ports: - - name: http-metrics - port: 10252 - protocol: TCP diff --git a/manifests/k8s/minikube/kube-scheduler.yaml b/manifests/k8s/minikube/kube-scheduler.yaml deleted file mode 100644 index 8599575c..00000000 --- a/manifests/k8s/minikube/kube-scheduler.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - namespace: kube-system - name: kube-scheduler-prometheus-discovery - labels: - k8s-app: kube-scheduler -spec: - type: ClusterIP - clusterIP: None - ports: - - name: http-metrics - port: 10251 - targetPort: 10251 - protocol: TCP ---- -apiVersion: v1 -kind: Endpoints -metadata: - namespace: kube-system - name: kube-scheduler-prometheus-discovery - labels: - k8s-app: kube-scheduler -subsets: -- addresses: - - ip: MINIKUBE_IP - ports: - - name: http-metrics - port: 10251 - protocol: TCP From 52990f7951faef918a84a95799221d8e43edf853 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 19 Jan 2018 15:18:53 +0100 Subject: [PATCH 184/638] 
kube-prometheus: Use non-root and Prometheus v2.1.0 --- .../kube-state-metrics-deployment.yaml | 9 ++++++--- manifests/prometheus/prometheus-k8s.yaml | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index 6a79f02b..61f918eb 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -10,6 +10,9 @@ spec: app: kube-state-metrics spec: serviceAccountName: kube-state-metrics + securityContext: + runAsNonRoot: true + runAsUser: 65534 containers: - name: kube-rbac-proxy-main image: quay.io/brancz/kube-rbac-proxy:v0.2.0 @@ -70,8 +73,8 @@ spec: - /pod_nanny - --container=kube-state-metrics - --cpu=100m - - --extra-cpu=1m - - --memory=100Mi - - --extra-memory=2Mi + - --extra-cpu=2m + - --memory=150Mi + - --extra-memory=30Mi - --threshold=5 - --deployment=kube-state-metrics diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 08a71023..401784fa 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v2.0.0 + version: v2.1.0 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: From 6afb6bce43730c3496b4420b3527d65c1865481d Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 19 Jan 2018 16:34:36 +0100 Subject: [PATCH 185/638] *: re-generate manifests --- manifests/prometheus-operator/prometheus-operator.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index b773021c..71af2d7d 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -27,4 +27,7 @@ spec: 
requests: cpu: 100m memory: 50Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 serviceAccountName: prometheus-operator From 546a2e6ac6f9ff5ff216a8edffc7f03a4a76577a Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 22 Jan 2018 14:42:15 +0100 Subject: [PATCH 186/638] *: Use non-root --- grafana-image/Dockerfile | 15 +++++++++++++++ grafana-image/Makefile | 2 ++ grafana-image/config.toml | 2 ++ .../templates/grafana-deployment-template.yaml | 7 +++++-- manifests/grafana/grafana-deployment.yaml | 7 +++++-- .../node-exporter/node-exporter-daemonset.yaml | 3 +++ 6 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 grafana-image/Dockerfile create mode 100644 grafana-image/Makefile create mode 100644 grafana-image/config.toml diff --git a/grafana-image/Dockerfile b/grafana-image/Dockerfile new file mode 100644 index 00000000..bac01b59 --- /dev/null +++ b/grafana-image/Dockerfile @@ -0,0 +1,15 @@ +FROM debian:9.3-slim + +RUN apt-get update && apt-get install -qq -y wget tar sqlite && \ + wget -O /tmp/grafana.tar.gz https://s3-us-west-2.amazonaws.com/grafana-releases/release/grafana-4.6.3.linux-x64.tar.gz && \ + tar -zxvf /tmp/grafana.tar.gz -C /tmp && mv /tmp/grafana-4.6.3 /grafana && \ + rm -rf /tmp/grafana.tar.gz + +ADD config.toml /grafana/conf/config.toml + +USER nobody +EXPOSE 3000 +VOLUME [ "/data" ] +WORKDIR /grafana +ENTRYPOINT [ "/grafana/bin/grafana-server" ] +CMD [ "-config=/grafana/conf/config.toml" ] diff --git a/grafana-image/Makefile b/grafana-image/Makefile new file mode 100644 index 00000000..12fc7b81 --- /dev/null +++ b/grafana-image/Makefile @@ -0,0 +1,2 @@ +container: + docker build . 
-t quay.io/coreos/monitoring-grafana:4.6.3-non-root diff --git a/grafana-image/config.toml b/grafana-image/config.toml new file mode 100644 index 00000000..7ed992c6 --- /dev/null +++ b/grafana-image/config.toml @@ -0,0 +1,2 @@ +[database] +path = /data/grafana.db diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml index 8a7b8c02..091d4e80 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -9,9 +9,12 @@ spec: labels: app: grafana spec: + securityContext: + runAsNonRoot: true + runAsUser: 65534 containers: - name: grafana - image: grafana/grafana:4.6.3 + image: quay.io/coreos/monitoring-grafana:4.6.3-non-root env: - name: GF_AUTH_BASIC_ENABLED value: "true" @@ -29,7 +32,7 @@ spec: key: password volumeMounts: - name: grafana-storage - mountPath: /var/grafana-storage + mountPath: /data ports: - name: web containerPort: 3000 diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 29dd9022..d1b7c806 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -9,9 +9,12 @@ spec: labels: app: grafana spec: + securityContext: + runAsNonRoot: true + runAsUser: 65534 containers: - name: grafana - image: grafana/grafana:4.6.3 + image: quay.io/coreos/monitoring-grafana:4.6.3-non-root env: - name: GF_AUTH_BASIC_ENABLED value: "true" @@ -29,7 +32,7 @@ spec: key: password volumeMounts: - name: grafana-storage - mountPath: /var/grafana-storage + mountPath: /data ports: - name: web containerPort: 3000 diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml index 250398bd..f92113e8 100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ 
b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -14,6 +14,9 @@ spec: name: node-exporter spec: serviceAccountName: node-exporter + securityContext: + runAsNonRoot: true + runAsUser: 65534 hostNetwork: true hostPID: true containers: From 642df8ac25625bf3e243855e7e661e228b5bdd2c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 26 Jan 2018 12:03:26 +0100 Subject: [PATCH 187/638] kube-prometheus: Fix generate scripts --- hack/scripts/generate-dashboards-configmap.sh | 2 +- hack/scripts/wrap-dashboard.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh index 61140317..8cd76adc 100755 --- a/hack/scripts/generate-dashboards-configmap.sh +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -36,7 +36,7 @@ for f in assets/grafana/generated/*-dashboard.json do basefilename=$(basename $f) echo " $basefilename: |+" - if [ "$basefilename" -eq "etcd-dashboard.json" ]; then + if [ "$basefilename" = "etcd-dashboard.json" ]; then hack/scripts/wrap-dashboard.sh $f prometheus-etcd | sed "s/^/ /g" else hack/scripts/wrap-dashboard.sh $f prometheus-k8s | sed "s/^/ /g" diff --git a/hack/scripts/wrap-dashboard.sh b/hack/scripts/wrap-dashboard.sh index 02b1c531..a2ac854c 100755 --- a/hack/scripts/wrap-dashboard.sh +++ b/hack/scripts/wrap-dashboard.sh @@ -24,7 +24,7 @@ dashboardjson=$1 datasource_name=$2 inputname="DS_PROMETHEUS" -if [ "$datasource_name" -eq "prometheus-etcd" ]; then +if [ "$datasource_name" = "prometheus-etcd" ]; then $inputname="DS_PROMETHEUS-ETCD" fi From 14d20c140ea4e6101997f4083d4f5e02ff974094 Mon Sep 17 00:00:00 2001 From: Scott Brenner Date: Tue, 30 Jan 2018 13:09:50 -0800 Subject: [PATCH 188/638] Minor typo fix (#939) Promnetheus -> Prometheus --- docs/Monitoring external etcd.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Monitoring external etcd.md b/docs/Monitoring external etcd.md index 
15e5632e..2bb79698 100644 --- a/docs/Monitoring external etcd.md +++ b/docs/Monitoring external etcd.md @@ -13,7 +13,7 @@ Prometheus Operator allows us to mount secrets in the pod. By loading the secret where CREDENTIAL_PATH is the path to your etcd client credentials on your work machine. (Kube-aws stores them inside the credential folder). -## b - Get Promnetheus Operator to load the secret +## b - Get Prometheus Operator to load the secret In the previous step we have named the secret 'etcd-certs'. Edit prometheus-operator/contrib/kube-prometheus/manifests/prometheus/prometheus-k8s.yaml and add the secret under the spec of the Prometheus object manifest: @@ -167,4 +167,4 @@ Once you are happy with the dashboard, export it and move it to `prometheus-oper ### Reload the manifest in Kubernetes: ` kubectl -n monitoring replace -f manifests/grafana/grafana-dashboards.yaml` -After a few minutes your dasboard will be available permanently to all Grafana instances \ No newline at end of file +After a few minutes your dasboard will be available permanently to all Grafana instances From aafb48c4db828b0474d704d3c9c3872cc7695c91 Mon Sep 17 00:00:00 2001 From: ludek_navratil Date: Tue, 30 Jan 2018 22:53:26 +0000 Subject: [PATCH 189/638] fixing inputname var assignment --- hack/scripts/wrap-dashboard.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/scripts/wrap-dashboard.sh b/hack/scripts/wrap-dashboard.sh index a2ac854c..02e9ee42 100755 --- a/hack/scripts/wrap-dashboard.sh +++ b/hack/scripts/wrap-dashboard.sh @@ -25,7 +25,7 @@ datasource_name=$2 inputname="DS_PROMETHEUS" if [ "$datasource_name" = "prometheus-etcd" ]; then - $inputname="DS_PROMETHEUS-ETCD" + inputname="DS_PROMETHEUS-ETCD" fi cat < Date: Wed, 31 Jan 2018 19:48:28 +0000 Subject: [PATCH 190/638] fixing#947 --- hack/scripts/generate-dashboards-configmap.sh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/hack/scripts/generate-dashboards-configmap.sh 
b/hack/scripts/generate-dashboards-configmap.sh index 8cd76adc..c023874e 100755 --- a/hack/scripts/generate-dashboards-configmap.sh +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -15,14 +15,9 @@ do rm -rf $f done -for f in assets/grafana/generated/*-datasource.json -do - rm -rf $f -done - -virtualenv -p python3 .env -source .env/bin/activate -pip install -Ur requirements.txt +virtualenv -p python3 .env 2>&1 > /dev/null +source .env/bin/activate 2>&1 > /dev/null +pip install -Ur requirements.txt 2>&1 > /dev/null for f in assets/grafana/*.dashboard.py do basefilename=$(basename $f) From 57cb22be7ad7b0bdf700e5ae9e54428e4aa1f9f1 Mon Sep 17 00:00:00 2001 From: gitfrederic Date: Sat, 3 Feb 2018 14:22:42 -0800 Subject: [PATCH 191/638] Fix for issue #924 As mentioned in https://github.com/coreos/prometheus-operator/issues/924#issuecomment-362859346 . --- grafana-image/config.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/grafana-image/config.toml b/grafana-image/config.toml index 7ed992c6..74f5d78b 100644 --- a/grafana-image/config.toml +++ b/grafana-image/config.toml @@ -1,2 +1,7 @@ [database] path = /data/grafana.db + +[paths] +data = /data +logs = /data/log +plugins = /data/plugins From ba5192bcaf8665821d7d8d8cb95e57cd584bed29 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Sun, 4 Feb 2018 16:12:54 +0100 Subject: [PATCH 192/638] kube-prometheus: Update Grafana image --- grafana-image/Makefile | 2 +- .../templates/grafana-deployment-template.yaml | 2 +- manifests/grafana/grafana-deployment.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grafana-image/Makefile b/grafana-image/Makefile index 12fc7b81..f054a054 100644 --- a/grafana-image/Makefile +++ b/grafana-image/Makefile @@ -1,2 +1,2 @@ container: - docker build . -t quay.io/coreos/monitoring-grafana:4.6.3-non-root + docker build . 
-t quay.io/coreos/monitoring-grafana:4.6.3-non-root.1 diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml index 091d4e80..a607f353 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -14,7 +14,7 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:4.6.3-non-root + image: quay.io/coreos/monitoring-grafana:4.6.3-non-root.1 env: - name: GF_AUTH_BASIC_ENABLED value: "true" diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index d1b7c806..804657a0 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -14,7 +14,7 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:4.6.3-non-root + image: quay.io/coreos/monitoring-grafana:4.6.3-non-root.1 env: - name: GF_AUTH_BASIC_ENABLED value: "true" From 37f7fd52e87c7f8ba651451f70f7e465a7784d0b Mon Sep 17 00:00:00 2001 From: ludek_navratil Date: Thu, 8 Feb 2018 17:21:44 +0000 Subject: [PATCH 193/638] fixing#972 --- hack/scripts/generate-dashboards-configmap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh index c023874e..0c2c9b7a 100755 --- a/hack/scripts/generate-dashboards-configmap.sh +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -34,7 +34,7 @@ do if [ "$basefilename" = "etcd-dashboard.json" ]; then hack/scripts/wrap-dashboard.sh $f prometheus-etcd | sed "s/^/ /g" else - hack/scripts/wrap-dashboard.sh $f prometheus-k8s | sed "s/^/ /g" + hack/scripts/wrap-dashboard.sh $f prometheus | sed "s/^/ /g" fi done From 51515b1e2b3aba53abf248b3ee6774be84802f1a 
Mon Sep 17 00:00:00 2001 From: Scott Brenner Date: Thu, 8 Feb 2018 12:50:19 -0800 Subject: [PATCH 194/638] Update wrap-dashboard.sh Might as well, right? --- hack/scripts/wrap-dashboard.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hack/scripts/wrap-dashboard.sh b/hack/scripts/wrap-dashboard.sh index 02e9ee42..d3b04085 100755 --- a/hack/scripts/wrap-dashboard.sh +++ b/hack/scripts/wrap-dashboard.sh @@ -4,8 +4,8 @@ # * Edit dashboard in Grafana (you need to login first with admin/admin # login/password). # * Save dashboard in Grafana to check is specification is correct. -# Looks like this is the only way to check is dashboard specification -# has error. +# Looks like this is the only way to check if dashboard specification +# has errors. # * Download dashboard specification as JSON file in Grafana: # Share -> Export -> Save to file. # * Drop dashboard specification in assets folder: From 85f88025f356e00c72111fd07b5b534c9088fa3f Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 7 Feb 2018 10:09:41 +0100 Subject: [PATCH 195/638] kube-prometheus: Upgrade to grafana v5 --- assets/grafana/_grafanalib.py | 6 +- assets/grafana/deployment.dashboard.py | 4 +- .../kubernetes-capacity-planning.dashboard.py | 22 +- .../kubernetes-cluster-health.dashboard.py | 18 +- ...bernetes-control-plane-status.dashboard.py | 16 +- .../kubernetes-resource-requests.dashboard.py | 10 +- assets/grafana/nodes.dashboard.py | 20 +- assets/grafana/pods.dashboard.py | 14 +- .../raw-json-dashboards/etcd-dashboard.json | 26 +- assets/grafana/statefulset.dashboard.py | 4 +- grafana-image/Dockerfile | 10 +- grafana-image/Makefile | 5 +- grafana-image/{config.toml => config.ini} | 9 + .../bin/grafana_dashboards_generate.sh | 9 +- .../templates/ConfigMap.header | 2 +- .../templates/dashboard.foot | 11 - .../templates/dashboard.header | 2 - .../grafana-deployment-template.yaml | 53 +- hack/scripts/generate-dashboards-configmap.sh | 9 +- 
hack/scripts/generate-manifests.sh | 6 +- .../grafana-dashboard-definitions.yaml | 7360 ++++++++++++++++ manifests/grafana/grafana-dashboards.yaml | 7502 +---------------- manifests/grafana/grafana-datasources.yaml | 15 + manifests/grafana/grafana-deployment.yaml | 59 +- 24 files changed, 7510 insertions(+), 7682 deletions(-) rename grafana-image/{config.toml => config.ini} (51%) create mode 100644 manifests/grafana/grafana-dashboard-definitions.yaml create mode 100644 manifests/grafana/grafana-datasources.yaml diff --git a/assets/grafana/_grafanalib.py b/assets/grafana/_grafanalib.py index e1e121e5..b304809d 100644 --- a/assets/grafana/_grafanalib.py +++ b/assets/grafana/_grafanalib.py @@ -13,7 +13,7 @@ def Dashboard( title=title, refresh=None, schemaVersion=14, version=version, time=time, timezone='browser', inputs=[ { - 'name': 'DS_PROMETHEUS', + 'name': 'prometheus', 'label': 'prometheus', 'description': '', 'type': 'datasource', @@ -54,7 +54,7 @@ def SingleStat( return core.SingleStat( title=title, id=id, colorValue=colorValue, - dataSource='${DS_PROMETHEUS}', gauge=gauge, + dataSource='prometheus', gauge=gauge, valueFontSize=valueFontSize, thresholds=thresholds, valueName=valueName, valueMaps=valueMaps, rangeMaps=rangeMaps, mappingTypes=mappingTypes, targets=targets, @@ -81,7 +81,7 @@ def Graph( return core.Graph( id=id, title=title, dashLength=dashLength, dashes=dashes, spaceLength=spaceLength, targets=targets, xAxis=xAxis, yAxes=yAxes, - dataSource='${DS_PROMETHEUS}', nullPointMode=nullPointMode, editable=False, + dataSource='prometheus', nullPointMode=nullPointMode, editable=False, ) diff --git a/assets/grafana/deployment.dashboard.py b/assets/grafana/deployment.dashboard.py index 4a2bda46..6cecd4bf 100644 --- a/assets/grafana/deployment.dashboard.py +++ b/assets/grafana/deployment.dashboard.py @@ -13,7 +13,7 @@ dashboard = Dashboard( { 'allValue': '.*', 'current': {}, - 'datasource': '${DS_PROMETHEUS}', + 'datasource': 'prometheus', 'hide': 0, 
'includeAll': False, 'label': 'Namespace', @@ -34,7 +34,7 @@ dashboard = Dashboard( { 'allValue': None, 'current': {}, - 'datasource': '${DS_PROMETHEUS}', + 'datasource': 'prometheus', 'hide': 0, 'includeAll': False, 'label': 'Deployment', diff --git a/assets/grafana/kubernetes-capacity-planning.dashboard.py b/assets/grafana/kubernetes-capacity-planning.dashboard.py index 00e5ada2..9b02010a 100644 --- a/assets/grafana/kubernetes-capacity-planning.dashboard.py +++ b/assets/grafana/kubernetes-capacity-planning.dashboard.py @@ -13,7 +13,7 @@ dashboard = Dashboard( timezone='browser', inputs=[ { - 'name': 'DS_PROMETHEUS', + 'name': 'prometheus', 'label': 'prometheus', 'description': '', 'type': 'datasource', @@ -28,7 +28,7 @@ dashboard = Dashboard( Graph( title='Idle CPU', id=3, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', dashLength=10, dashes=False, isNew=False, @@ -55,7 +55,7 @@ dashboard = Dashboard( Graph( title='System Load', id=9, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', dashLength=10, dashes=False, isNew=False, @@ -102,7 +102,7 @@ dashboard = Dashboard( Graph( title='Memory Usage', id=4, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', dashLength=10, dashes=False, isNew=False, @@ -170,7 +170,7 @@ dashboard = Dashboard( ), SingleStat( title='Memory Usage', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=5, format='percent', span=3, @@ -212,7 +212,7 @@ dashboard = Dashboard( titleSize='h6', panels=[ Graph( title='Disk I/O', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=6, dashLength=10, dashes=False, @@ -267,7 +267,7 @@ dashboard = Dashboard( ), SingleStat( title='Disk Space Usage', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=12, span=3, editable=False, @@ -312,7 +312,7 @@ dashboard = Dashboard( titleSize='h6', panels=[ Graph( title='Network Received', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=8, dashLength=10, dashes=False, @@ -346,7 +346,7 @@ 
dashboard = Dashboard( ), Graph( title='Network Transmitted', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=10, dashLength=10, dashes=False, @@ -386,7 +386,7 @@ dashboard = Dashboard( panels=[ Graph( title='Cluster Pod Utilization', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=11, span=9, dashes=False, @@ -421,7 +421,7 @@ dashboard = Dashboard( ), SingleStat( title='Pod Utilization', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=7, editable=False, span=3, diff --git a/assets/grafana/kubernetes-cluster-health.dashboard.py b/assets/grafana/kubernetes-cluster-health.dashboard.py index dbd402f3..7f1cfe64 100644 --- a/assets/grafana/kubernetes-cluster-health.dashboard.py +++ b/assets/grafana/kubernetes-cluster-health.dashboard.py @@ -11,7 +11,7 @@ dashboard = Dashboard( timezone='browser', inputs=[ { - 'name': 'DS_PROMETHEUS', + 'name': 'prometheus', 'label': 'prometheus', 'description': '', 'type': 'datasource', @@ -26,7 +26,7 @@ dashboard = Dashboard( SingleStat( title='Control Plane Components Down', id=1, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', gauge=Gauge(), span=3, thresholds='1, 3', @@ -76,7 +76,7 @@ dashboard = Dashboard( SingleStat( title='Alerts Firing', id=2, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', gauge=Gauge(), colorValue=True, editable=False, @@ -122,7 +122,7 @@ dashboard = Dashboard( SingleStat( title='Alerts Pending', id=3, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', gauge=Gauge(), colorValue=True, editable=False, @@ -168,7 +168,7 @@ dashboard = Dashboard( SingleStat( title='Crashlooping Pods', id=4, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', gauge=Gauge(), colorValue=True, editable=False, @@ -219,7 +219,7 @@ dashboard = Dashboard( SingleStat( title='Node Not Ready', id=5, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', gauge=Gauge(), colorValue=True, editable=False, @@ -265,7 +265,7 @@ dashboard = Dashboard( 
SingleStat( title='Node Disk Pressure', id=6, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', gauge=Gauge(), colorValue=True, editable=False, @@ -311,7 +311,7 @@ dashboard = Dashboard( SingleStat( title='Node Memory Pressure', id=7, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', gauge=Gauge(), colorValue=True, editable=False, @@ -357,7 +357,7 @@ dashboard = Dashboard( SingleStat( title='Nodes Unschedulable', id=8, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', gauge=Gauge(), colorValue=True, editable=False, diff --git a/assets/grafana/kubernetes-control-plane-status.dashboard.py b/assets/grafana/kubernetes-control-plane-status.dashboard.py index fad157f5..d2f35129 100644 --- a/assets/grafana/kubernetes-control-plane-status.dashboard.py +++ b/assets/grafana/kubernetes-control-plane-status.dashboard.py @@ -11,7 +11,7 @@ dashboard = Dashboard( editable=False, inputs=[ { - 'name': 'DS_PROMETHEUS', + 'name': 'prometheus', 'label': 'prometheus', 'description': '', 'type': 'datasource', @@ -25,7 +25,7 @@ dashboard = Dashboard( panels=[ SingleStat( title='API Servers UP', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', format='percent', editable=False, gauge=Gauge( @@ -76,7 +76,7 @@ dashboard = Dashboard( ), SingleStat( title='Controller Managers UP', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', format='percent', editable=False, gauge=Gauge( @@ -128,7 +128,7 @@ dashboard = Dashboard( ), SingleStat( title='Schedulers UP', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', format='percent', editable=False, gauge=Gauge( @@ -179,7 +179,7 @@ dashboard = Dashboard( ), SingleStat( title='API Server Request Error Rate', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', format='percent', editable=False, gauge=Gauge( @@ -234,7 +234,7 @@ dashboard = Dashboard( Graph( title='API Server Request Latency', id=7, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', dashLength=10, dashes=False, 
isNew=False, @@ -269,7 +269,7 @@ dashboard = Dashboard( Graph( title='End to End Scheduling Latency', id=5, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', isNew=False, editable=False, dashLength=10, @@ -300,7 +300,7 @@ dashboard = Dashboard( Graph( title='API Server Request Rates', id=6, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', isNew=False, editable=False, dashLength=10, diff --git a/assets/grafana/kubernetes-resource-requests.dashboard.py b/assets/grafana/kubernetes-resource-requests.dashboard.py index 6e4c510c..5d5b3bd8 100644 --- a/assets/grafana/kubernetes-resource-requests.dashboard.py +++ b/assets/grafana/kubernetes-resource-requests.dashboard.py @@ -12,7 +12,7 @@ dashboard = Dashboard( timezone='browser', inputs=[ { - 'name': 'DS_PROMETHEUS', + 'name': 'prometheus', 'label': 'prometheus', 'description': '', 'type': 'datasource', @@ -34,7 +34,7 @@ dashboard = Dashboard( 'community/blob/master/contributors/design-proposals/' 'node-allocatable.md) is also shown.', id=1, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', dashLength=10, dashes=False, isNew=False, @@ -73,7 +73,7 @@ dashboard = Dashboard( ), SingleStat( title='CPU Cores', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=2, format='percent', editable=False, @@ -117,7 +117,7 @@ dashboard = Dashboard( Graph( title='Memory', id=3, - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', description='This represents the total [memory resource ' 'requests](https://kubernetes.io/docs/concepts/' 'configuration/manage-compute-resources-container/' @@ -163,7 +163,7 @@ dashboard = Dashboard( ), SingleStat( title='Memory', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=4, format='percent', span=3, diff --git a/assets/grafana/nodes.dashboard.py b/assets/grafana/nodes.dashboard.py index 89d2b1f9..da7b7d24 100644 --- a/assets/grafana/nodes.dashboard.py +++ b/assets/grafana/nodes.dashboard.py @@ -14,7 +14,7 @@ dashboard = Dashboard( 
timezone='browser', inputs=[ { - 'name': 'DS_PROMETHEUS', + 'name': 'prometheus', 'label': 'prometheus', 'description': '', 'type': 'datasource', @@ -26,7 +26,7 @@ dashboard = Dashboard( { 'allValue': None, 'current': {}, - 'datasource': '${DS_PROMETHEUS}', + 'datasource': 'prometheus', 'hide': 0, 'includeAll': False, 'label': None, @@ -50,7 +50,7 @@ dashboard = Dashboard( titleSize='h6', panels=[ Graph( title='Idle CPU', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=3, isNew=False, editable=False, @@ -82,7 +82,7 @@ dashboard = Dashboard( ), Graph( title='System Load', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=9, isNew=False, editable=False, @@ -129,7 +129,7 @@ dashboard = Dashboard( titleSize='h6', panels=[ Graph( title='Memory Usage', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=4, isNew=False, editable=False, @@ -198,7 +198,7 @@ dashboard = Dashboard( ), SingleStat( title='Memory Usage', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=5, format='percent', gauge=Gauge(show=True), @@ -241,7 +241,7 @@ dashboard = Dashboard( showTitle=False, panels=[ Graph( title='Disk I/O', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=6, dashLength=10, dashes=False, @@ -304,7 +304,7 @@ dashboard = Dashboard( ), SingleStat( title='Disk Space Usage', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=7, thresholds='0.75, 0.9', editable=False, @@ -351,7 +351,7 @@ dashboard = Dashboard( panels=[ Graph( title='Network Received', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=8, dashLength=10, dashes=False, @@ -385,7 +385,7 @@ dashboard = Dashboard( ), Graph( title='Network Transmitted', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=10, dashLength=10, dashes=False, diff --git a/assets/grafana/pods.dashboard.py b/assets/grafana/pods.dashboard.py index f5258cbc..84b3fdef 100644 --- a/assets/grafana/pods.dashboard.py +++ b/assets/grafana/pods.dashboard.py @@ 
-12,7 +12,7 @@ dashboard = Dashboard( timezone='browser', inputs=[ { - 'name': 'DS_PROMETHEUS', + 'name': 'prometheus', 'label': 'prometheus', 'description': '', 'type': 'datasource', @@ -24,7 +24,7 @@ dashboard = Dashboard( { 'allValue': '.*', 'current': {}, - 'datasource': '${DS_PROMETHEUS}', + 'datasource': 'prometheus', 'hide': 0, 'includeAll': True, 'label': 'Namespace', @@ -44,7 +44,7 @@ dashboard = Dashboard( { 'allValue': None, 'current': {}, - 'datasource': '${DS_PROMETHEUS}', + 'datasource': 'prometheus', 'hide': 0, 'includeAll': False, 'label': 'Pod', @@ -65,7 +65,7 @@ dashboard = Dashboard( { 'allValue': '.*', 'current': {}, - 'datasource': '${DS_PROMETHEUS}', + 'datasource': 'prometheus', 'hide': 0, 'includeAll': True, 'label': 'Container', @@ -90,7 +90,7 @@ dashboard = Dashboard( titleSize='h6', panels=[ Graph( title='Memory Usage', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=1, isNew=False, editable=False, @@ -155,7 +155,7 @@ dashboard = Dashboard( titleSize='h6', panels=[ Graph( title='CPU Usage', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=2, isNew=False, editable=False, @@ -218,7 +218,7 @@ dashboard = Dashboard( titleSize='h6', panels=[ Graph( title='Network I/O', - dataSource='${DS_PROMETHEUS}', + dataSource='prometheus', id=3, isNew=False, editable=False, diff --git a/assets/grafana/raw-json-dashboards/etcd-dashboard.json b/assets/grafana/raw-json-dashboards/etcd-dashboard.json index 0098ffea..f2a03cec 100644 --- a/assets/grafana/raw-json-dashboards/etcd-dashboard.json +++ b/assets/grafana/raw-json-dashboards/etcd-dashboard.json @@ -1,7 +1,7 @@ { "__inputs": [ { - "name": "DS_PROMETHEUS", + "name": "prometheus", "label": "prometheus", "description": "", "type": "datasource", @@ -60,7 +60,7 @@ "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "format": "none", @@ -136,7 +136,7 @@ "bars": false, 
"dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "fill": 0, @@ -225,7 +225,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "fill": 0, @@ -326,7 +326,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "decimals": null, "editable": false, "error": false, @@ -408,7 +408,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "fill": 0, @@ -497,7 +497,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "fill": 0, @@ -589,7 +589,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "fill": 5, @@ -669,7 +669,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "fill": 5, @@ -749,7 +749,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "fill": 0, @@ -829,7 +829,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "decimals": null, "editable": false, "error": false, @@ -923,7 +923,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "editable": false, "error": false, "fill": 0, @@ -1029,7 +1029,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "decimals": 0, "editable": false, "error": false, diff --git 
a/assets/grafana/statefulset.dashboard.py b/assets/grafana/statefulset.dashboard.py index a28d70fd..780630a2 100644 --- a/assets/grafana/statefulset.dashboard.py +++ b/assets/grafana/statefulset.dashboard.py @@ -13,7 +13,7 @@ dashboard = Dashboard( { 'allValue': '.*', 'current': {}, - 'datasource': '${DS_PROMETHEUS}', + 'datasource': 'prometheus', 'hide': 0, 'includeAll': False, 'label': 'Namespace', @@ -34,7 +34,7 @@ dashboard = Dashboard( { 'allValue': None, 'current': {}, - 'datasource': '${DS_PROMETHEUS}', + 'datasource': 'prometheus', 'hide': 0, 'includeAll': False, 'label': 'StatefulSet', diff --git a/grafana-image/Dockerfile b/grafana-image/Dockerfile index bac01b59..fdbef289 100644 --- a/grafana-image/Dockerfile +++ b/grafana-image/Dockerfile @@ -1,15 +1,17 @@ FROM debian:9.3-slim +ARG GRAFANA_VERSION + RUN apt-get update && apt-get install -qq -y wget tar sqlite && \ - wget -O /tmp/grafana.tar.gz https://s3-us-west-2.amazonaws.com/grafana-releases/release/grafana-4.6.3.linux-x64.tar.gz && \ - tar -zxvf /tmp/grafana.tar.gz -C /tmp && mv /tmp/grafana-4.6.3 /grafana && \ + wget -O /tmp/grafana.tar.gz https://s3-us-west-2.amazonaws.com/grafana-releases/release/grafana-$GRAFANA_VERSION.linux-x64.tar.gz && \ + tar -zxvf /tmp/grafana.tar.gz -C /tmp && mv /tmp/grafana-$GRAFANA_VERSION /grafana && \ rm -rf /tmp/grafana.tar.gz -ADD config.toml /grafana/conf/config.toml +ADD config.ini /grafana/conf/config.ini USER nobody EXPOSE 3000 VOLUME [ "/data" ] WORKDIR /grafana ENTRYPOINT [ "/grafana/bin/grafana-server" ] -CMD [ "-config=/grafana/conf/config.toml" ] +CMD [ "-config=/grafana/conf/config.ini" ] diff --git a/grafana-image/Makefile b/grafana-image/Makefile index f054a054..9b352c17 100644 --- a/grafana-image/Makefile +++ b/grafana-image/Makefile @@ -1,2 +1,5 @@ +VERSION=5.0.0-beta1 +IMAGE_TAG=$(VERSION) + container: - docker build . 
-t quay.io/coreos/monitoring-grafana:4.6.3-non-root.1 + docker build --build-arg GRAFANA_VERSION=$(VERSION) -t quay.io/coreos/monitoring-grafana:$(IMAGE_TAG) . diff --git a/grafana-image/config.toml b/grafana-image/config.ini similarity index 51% rename from grafana-image/config.toml rename to grafana-image/config.ini index 74f5d78b..14e0c7ad 100644 --- a/grafana-image/config.toml +++ b/grafana-image/config.ini @@ -5,3 +5,12 @@ path = /data/grafana.db data = /data logs = /data/log plugins = /data/plugins + +[session] +provider = memory + +[auth.basic] +enabled = false + +[auth.anonymous] +enabled = true diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index b4273baa..d89cfc97 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -70,7 +70,7 @@ DATE_EXEC="$(date "+%Y-%m-%d-%H%M%S")" BIN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" TOOL_HOME="$(dirname $BIN_DIR)" SCRIPT_BASE=`basename $0 | sed "s/\.[Ss][Hh]//"` -CONFIGMAP_DASHBOARD_PREFIX="grafana-dashboards" +CONFIGMAP_DASHBOARD_PREFIX="grafana-dashboard-definitions" TEMPLATES_DIR="$TOOL_HOME/templates" DASHBOARD_HEADER_FILE="$TEMPLATES_DIR/dashboard.header" @@ -327,7 +327,10 @@ initialize-bin-pack bin-pack-files "$(find $DASHBOARDS_DIR -maxdepth 1 -type f -name "*-dashboard.json" | sort)" # Continue processing datasources (maintaining the same queue) -bin-pack-files "$(find $DASHBOARDS_DIR -maxdepth 1 -type f -name "*-datasource.json" | sort )" +# +# Commented out, as datasources are provisionable by Grafana by default in Grafana v5, but from a separate directory, meaning a separate ConfigMap for us. 
+# +# bin-pack-files "$(find $DASHBOARDS_DIR -maxdepth 1 -type f -name "*-datasource.json" | sort )" # Processing remaining data in the queue (or unique) if [ "$to_process" ]; then @@ -361,7 +364,7 @@ for (( i=0; i<$total_configmaps_created; i++ )); do configmap="$CONFIGMAP_DASHBOARD_PREFIX-$i" echo "# Preparing grafana deployment to support configmap: $configmap" - test "$VOLUME_MOUNTS" && VOLUME_MOUNTS="$VOLUME_MOUNTS\n- name: $configmap\n mountPath: /var/$configmap" || VOLUME_MOUNTS="- name: $configmap\n mountPath: /var/$configmap" + test "$VOLUME_MOUNTS" && VOLUME_MOUNTS="$VOLUME_MOUNTS\n- name: $configmap\n mountPath: /grafana-dashboard-definitions/$i" || VOLUME_MOUNTS="- name: $configmap\n mountPath: /grafana-dashboard-definitions/$i" test "$VOLUMES" && VOLUMES="$VOLUMES\n- name: $configmap\n configMap:\n name: $configmap" || VOLUMES="- name: $configmap\n configMap:\n name: $configmap" test "$WATCH_DIR" && WATCH_DIR="$WATCH_DIR\n- '--watch-dir=/var/$configmap'" || WATCH_DIR="- '--watch-dir=/var/$configmap'" # echo "DEBUG:" diff --git a/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header b/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header index afc1f42c..73a14b05 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header +++ b/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header @@ -1,5 +1,5 @@ apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboards + name: grafana-dashboard-definitions data: diff --git a/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot b/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot index 81fe9f6f..e69de29b 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot +++ b/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot @@ -1,11 +0,0 @@ - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - 
], - "overwrite": true - } diff --git a/hack/grafana-dashboards-configmap-generator/templates/dashboard.header b/hack/grafana-dashboards-configmap-generator/templates/dashboard.header index 807e5d38..e69de29b 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/dashboard.header +++ b/hack/grafana-dashboards-configmap-generator/templates/dashboard.header @@ -1,2 +0,0 @@ - { - "dashboard": diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml index a607f353..fae45b25 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -14,25 +14,15 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:4.6.3-non-root.1 - env: - - name: GF_AUTH_BASIC_ENABLED - value: "true" - - name: GF_AUTH_ANONYMOUS_ENABLED - value: "true" - - name: GF_SECURITY_ADMIN_USER - valueFrom: - secretKeyRef: - name: grafana-credentials - key: user - - name: GF_SECURITY_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - name: grafana-credentials - key: password + image: quay.io/coreos/monitoring-grafana:5.0.0-beta1 volumeMounts: - name: grafana-storage mountPath: /data + - name: grafana-datasources + mountPath: /grafana/conf/provisioning/datasources + - name: grafana-dashboards + mountPath: /grafana/conf/provisioning/dashboards +XXX_VOLUME_MOUNTS_XXX ports: - name: web containerPort: 3000 @@ -43,32 +33,13 @@ spec: limits: memory: 200Mi cpu: 200m - - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.8 - args: -XXX_WATCH_DIR_XXX - - '--grafana-url=http://localhost:3000' - env: - - name: GRAFANA_USER - valueFrom: - secretKeyRef: - name: grafana-credentials - key: user - - name: GRAFANA_PASSWORD - valueFrom: - secretKeyRef: - name: grafana-credentials - key: password - 
resources: - requests: - memory: "16Mi" - cpu: "50m" - limits: - memory: "32Mi" - cpu: "100m" - volumeMounts: -XXX_VOLUME_MOUNTS_XXX volumes: - name: grafana-storage emptyDir: {} + - name: grafana-datasources + configMap: + name: grafana-datasources + - name: grafana-dashboards + configMap: + name: grafana-dashboards XXX_VOLUMES_XXX diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh index c023874e..81c157e7 100755 --- a/hack/scripts/generate-dashboards-configmap.sh +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -6,7 +6,7 @@ cat <<-EOF apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboards-0 + name: grafana-dashboard-definitions-0 data: EOF @@ -37,10 +37,3 @@ do hack/scripts/wrap-dashboard.sh $f prometheus-k8s | sed "s/^/ /g" fi done - -for f in assets/grafana/*-datasource.json -do - cp $f assets/grafana/generated/ - echo " $(basename $f): |+" - cat $f | sed "s/^/ /g" -done diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index 4826864c..b6f35a2d 100755 --- a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -6,16 +6,16 @@ set +x hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml # Generate Dashboard ConfigMap -hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml +hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboard-definitions.yaml # Generate Dashboard ConfigMap with configmap-generator tool # Max Size per ConfigMap: 240000 # Input dir: assets/grafana # output file: manifests/grafana/grafana-dashboards.yaml # grafana deployment output file: manifests/grafana/grafana-deployment.yaml -test -f manifests/grafana/grafana-dashboards.yaml && rm -f manifests/grafana/grafana-dashboards.yaml +test -f manifests/grafana/grafana-dashboards.yaml && rm -f manifests/grafana/grafana-dashboard-definitions.yaml test -f 
manifests/grafana/grafana-deployment.yaml && rm -f manifests/grafana/grafana-deployment.yaml -hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana/generated -o manifests/grafana/grafana-dashboards.yaml -g manifests/grafana/grafana-deployment.yaml +hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana/generated -o manifests/grafana/grafana-dashboard-definitions.yaml -g manifests/grafana/grafana-deployment.yaml # Generate Grafana Credentials Secret hack/scripts/generate-grafana-credentials-secret.sh admin admin > manifests/grafana/grafana-credentials.yaml diff --git a/manifests/grafana/grafana-dashboard-definitions.yaml b/manifests/grafana/grafana-dashboard-definitions.yaml new file mode 100644 index 00000000..dbd2a30d --- /dev/null +++ b/manifests/grafana/grafana-dashboard-definitions.yaml @@ -0,0 +1,7360 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-definitions-0 +data: + deployment-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 1, + "hideControls": false, + "links": [], + "rows": [ + { + "collapse": false, + "editable": false, + "height": "200px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + 
"nullPointMode": "connected", + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + 
"valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "100px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "links": [], + "mappingType": 
1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "metric": "kube_deployment_spec_replicas", + "refId": "A", + "step": 600 + } + ], + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + 
"intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to 
text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": 
"current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "show": false + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tags": [], + "tagsQuery": "", + "type": 
"query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Deployment", + "version": 1 + } + etcd-dashboard.json: |+ + { + "__inputs": [ + { + "name": "prometheus", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.5.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "etcd sample Grafana dashboard with Prometheus", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + 
"error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 28, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(etcd_server_has_leader)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "etcd_server_has_leader", + "refId": "A", + "step": 20 + } + ], + "thresholds": "", + "title": "Up", + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RPC Rate", + "metric": "grpc_server_started_total", + "refId": "A", + "step": 4 + }, + { + 
"expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RPC Failed Rate", + "metric": "grpc_server_handled_total", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "RPC Rate", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 41, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Watch Streams", + "metric": "grpc_server_handled_total", + "refId": "A", + "step": 4 + }, + { + "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "Lease Streams", + "metric": "grpc_server_handled_total", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Active Streams", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": null, + "editable": false, + "error": false, + "fill": 0, + "grid": {}, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "etcd_debugging_mvcc_db_total_size_in_bytes", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} DB Size", + "metric": "", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "DB Size", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}} WAL fsync", + "metric": "etcd_disk_wal_fsync_duration_seconds_bucket", + "refId": "A", + "step": 4 + }, + { + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} DB fsync", + "metric": "etcd_disk_backend_commit_duration_seconds_bucket", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk Sync Duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, 
+ "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Resident Memory", + "metric": "process_resident_memory_bytes", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 5, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": 
"connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Client Traffic In", + "metric": "etcd_network_client_grpc_received_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Client Traffic In", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 5, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Client Traffic Out", + "metric": "etcd_network_client_grpc_sent_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Client Traffic Out", + 
"tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Peer Traffic In", + "metric": "etcd_network_peer_received_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Peer Traffic In", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + 
"decimals": null, + "editable": false, + "error": false, + "fill": 0, + "grid": {}, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} Peer Traffic Out", + "metric": "etcd_network_peer_sent_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Peer Traffic Out", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 40, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": 
false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Failure Rate", + "metric": "etcd_server_proposals_failed_total", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(etcd_server_proposals_pending)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Pending Total", + "metric": "etcd_server_proposals_pending", + "refId": "B", + "step": 2 + }, + { + "expr": "sum(rate(etcd_server_proposals_committed_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Commit Rate", + "metric": "etcd_server_proposals_committed_total", + "refId": "C", + "step": 2 + }, + { + "expr": "sum(rate(etcd_server_proposals_applied_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Apply Rate", + "refId": "D", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Raft Proposals", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "editable": false, + "error": false, + "fill": 0, + "id": 19, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": false + }, + "lines": 
true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "changes(etcd_server_leader_changes_seen_total[1d])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Total Leader Elections Per Day", + "metric": "etcd_server_leader_changes_seen_total", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Total Leader Elections Per Day", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "etcd", + "version": 4 + } + kubernetes-capacity-planning-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": 
false, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + "refId": "A", + "step": 50 + } + ], + "title": "Idle CPU", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 9, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + 
"min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "title": "System Load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 4, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + 
"alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 10, + "target": "" + } + ], + "title": "Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + 
"postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "246px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 6, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + 
"intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 20 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 20 + } + ], + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "ms", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 12, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk Space Usage", + "transparent": 
false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 8, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "title": "Network Received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + 
}, + "id": 10, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "title": "Network Transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "276px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 11, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 11, + 
"span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_info)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current number of Pods", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_node_status_capacity_pods)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Maximum capacity of pods", + "refId": "B", + "step": 10 + } + ], + "title": "Cluster Pod Utilization", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + 
"thresholds": "80, 90", + "title": "Pod Utilization", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Capacity Planning", + "version": 4 + } + kubernetes-cluster-health-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "editable": false, + "height": "254px", + "panels": [ + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 1, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": 
"null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Control Plane Components Down", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "Everything UP and healthy", + "value": "null" + }, + { + "op": "=", + "text": "", + "value": "" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Alerts Firing", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + 
"valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "3, 5", + "title": "Alerts Pending", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 4, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to 
text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Crashlooping Pods", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": 
"sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Node Not Ready", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Node Disk Pressure", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + 
"editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Node Memory Pressure", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": 
{ + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(kube_node_spec_unschedulable)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Nodes Unschedulable", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Cluster Health", + "version": 9 + } + kubernetes-cluster-status-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "rows": [ + { + "collapse": false, + "editable": false, + "height": "129px", + "panels": [ + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + 
"value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Control Plane UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "UP", + "value": "null" + } + ], + "valueName": "total" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "3, 5", + "title": "Alerts Firing", + "type": "singlestat", + "valueFontSize": "80%", + 
"valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": true, + "title": "Cluster Health", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "168px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "API Servers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to 
text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "Controller Managers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": 
"Schedulers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Crashlooping Control Plane Pods", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": true, + "title": "Control Plane Status", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "158px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + 
"thresholdMarkers": true + }, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "80, 90", + "title": "CPU Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) 
- sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "80, 90", + "title": "Memory Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "80, 90", + "title": "Filesystem Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + 
"gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 10, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "80, 90", + "title": "Pod Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": true, + "title": "Capacity Planning", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Cluster Status", + "version": 3 + } + kubernetes-control-plane-status-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 0, + "hideControls": 
false, + "links": [], + "rows": [ + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 1, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "API Servers UP", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", 
+ "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "Controller Managers UP", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 
80", + "title": "Schedulers UP", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 4, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "5, 10", + "title": "API Server Request Error Rate", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + 
"threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 7, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 30 + } + ], + "title": "API Server Request Latency", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 5, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": 
false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 60 + } + ], + "title": "End to End Scheduling Latency", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "dtdurations", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 6, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Error Rate", + "refId": "A", + "step": 60 + }, + { + "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Request Rate", + "refId": "B", + "step": 60 + } + ], + "title": "API Server Request Rates", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Control Plane Status", + "version": 3 + } + kubernetes-resource-requests-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "editable": false, + "height": "300px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": false, + "legend": { + 
"alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + "refId": "A", + "step": 20 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B", + "step": 20 + } + ], + "title": "CPU Cores", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU Cores", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + 
"text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "CPU Cores", + "transparent": false, + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "CPU Cores", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "300px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A", + "step": 20 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B", + "step": 20 + } + ], + "title": "Memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 4, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": 
"Memory", + "transparent": false, + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Resource Requests", + "version": 2 + } + nodes-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": false, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + 
"spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "title": "Idle CPU", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 9, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "title": "System Load", + 
"tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 4, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 10 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 10 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + 
"legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 10 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 10 + } + ], + "title": "Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "transparent": false, + "type": "singlestat", + 
"valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 6, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 20 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 20 + } + ], + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + 
"values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "ms", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk Space Usage", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": 
"rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 8, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "title": "Network Received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 10, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted", + "yaxis": 2 + } + ], + 
"spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "title": "Network Transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 2 + } + pods-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 1, + "hideControls": false, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "editable": 
false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": false, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 15 + }, + { + "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + }, + { + "expr": "kube_pod_container_resource_limits_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "metric": "kube_pod_container_resource_limits_memory_bytes", + "refId": "C", + "step": 20 + } + ], + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ 
+ { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": false, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)(rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + }, + { + "expr": "kube_pod_container_resource_requests_cpu_cores{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_resource_requests_cpu_cores", + "refId": "B", + "step": 20 + }, + { + "expr": "kube_pod_container_resource_limits_cpu_cores{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "metric": "kube_pod_container_resource_limits_memory_bytes", + "refId": "C", + "step": 20 + } + ], + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, 
+ "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": false, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": 
[ + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 1 + } + statefulset-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 1, + "hideControls": false, + "links": [], + "rows": [ + { + "collapse": false, + "editable": false, + "height": "200px", + 
"panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + 
"from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": 
"avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "100px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "metric": "kube_statefulset_replicas", + "refId": "A", + "step": 600 + } + ], + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to 
text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_status_observed_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", 
+ "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_metadata_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": 
false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "show": false + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "statefulset_namespace", + "options": [], + "query": "label_values(kube_statefulset_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "StatefulSet", + "multi": false, + 
"name": "statefulset_name", + "options": [], + "query": "label_values(kube_statefulset_metadata_generation{namespace=\"$statefulset_namespace\"}, statefulset)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "statefulset", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "StatefulSet", + "version": 1 + } +--- diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index 82428564..ed3fe5fe 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -1,7498 +1,12 @@ apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboards-0 + name: grafana-dashboards data: - deployment-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 1, - "hideControls": false, - "links": [], - "rows": [ - { - "collapse": false, - "editable": false, - "height": "200px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - 
"value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "CPU", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "80%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - 
"text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "Bps", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "100px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - 
"thresholdMarkers": false - }, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "metric": "kube_deployment_spec_replicas", - "refId": "A", - "step": 600 - } - ], - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": 
"min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Available Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": 
false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "current replicas", - "refId": "A", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B", - "step": 30 - }, - { - "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "unavailable", - "refId": "C", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "D", - "step": 30 - }, - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "show": true - }, - { - "format": "short", - "label": "", - "logBase": 1, - "show": false - } - ] - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "deployment_namespace", - "options": [], - "query": 
"label_values(kube_deployment_metadata_generation, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Deployment", - "multi": false, - "name": "deployment_name", - "options": [], - "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "deployment", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Deployment", - "version": 1 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - etcd-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.5.2" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "etcd sample Grafana dashboard with Prometheus", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": 
false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 28, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(etcd_server_has_leader)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "metric": "etcd_server_has_leader", - "refId": "A", - "step": 20 - } - ], - "thresholds": "", - "title": "Up", - "type": "singlestat", - "valueFontSize": "200%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 0, - "id": 23, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, 
- "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 5, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Rate", - "metric": "grpc_server_started_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Failed Rate", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 0, - "id": 41, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", - "format": 
"time_series", - "intervalFactor": 2, - "legendFormat": "Watch Streams", - "metric": "grpc_server_handled_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Lease Streams", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Active Streams", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "etcd_debugging_mvcc_db_total_size_in_bytes", - "format": "time_series", - "hide": 
false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} DB Size", - "metric": "", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "DB Size", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 1, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}} WAL fsync", - "metric": "etcd_disk_wal_fsync_duration_seconds_bucket", - "refId": "A", - "step": 4 - }, - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} DB fsync", - "metric": "etcd_disk_backend_commit_duration_seconds_bucket", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk Sync 
Duration", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 0, - "id": 29, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Resident Memory", - "metric": "process_resident_memory_bytes", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 
"250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 5, - "id": 22, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Client Traffic In", - "metric": "etcd_network_client_grpc_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 5, - "id": 21, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - 
"stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Client Traffic Out", - "metric": "etcd_network_client_grpc_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 0, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Peer Traffic In", - "metric": "etcd_network_peer_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": 
null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 16, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} Peer Traffic Out", - "metric": "etcd_network_peer_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - 
"bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 0, - "id": 40, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Failure Rate", - "metric": "etcd_server_proposals_failed_total", - "refId": "A", - "step": 2 - }, - { - "expr": "sum(etcd_server_proposals_pending)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Pending Total", - "metric": "etcd_server_proposals_pending", - "refId": "B", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_committed_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Commit Rate", - "metric": "etcd_server_proposals_committed_total", - "refId": "C", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_applied_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Apply Rate", - "refId": "D", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Raft Proposals", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - 
"min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 0, - "editable": false, - "error": false, - "fill": 0, - "id": 19, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "changes(etcd_server_leader_changes_seen_total[1d])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Total Leader Elections Per Day", - "metric": "etcd_server_leader_changes_seen_total", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Total Leader Elections Per Day", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": { - "now": true, - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - 
"time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "etcd", - "version": 4 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - kubernetes-capacity-planning-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", - "hide": false, - "intervalFactor": 10, - "legendFormat": "", - "refId": "A", - "step": 50 - } - ], - "title": "Idle CPU", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - 
"values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 9, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_load1)", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load5)", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load15)", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "title": "System Load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - 
"dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 4, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", - "intervalFactor": 2, - "legendFormat": "memory usage", - "metric": "memo", - "refId": "A", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_Buffers)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "memo", - "refId": "B", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_Cached)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "memo", - "refId": "C", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_MemFree)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "memo", - "refId": "D", - "step": 10, - "target": "" - } - ], - "title": "Memory Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - 
"logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "intervalFactor": 2, - "metric": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory Usage", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "246px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 6, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - 
"current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_disk_bytes_read[5m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(rate(node_disk_bytes_written[5m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 20 - }, - { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 20 - } - ], - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "ms", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 12, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - 
"nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk Space Usage", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 8, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": 
"A", - "step": 10, - "target": "" - } - ], - "title": "Network Received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 10, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10, - "target": "" - } - ], - "title": "Network Transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "276px", - "panels": [ - { - "aliasColors": 
{}, - "bars": false, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 11, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 11, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_info)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current number of Pods", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(kube_node_status_capacity_pods)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Maximum capacity of pods", - "refId": "B", - "step": 10 - } - ], - "title": "Cluster Pod Utilization", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { 
- "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Pod Utilization", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Capacity Planning", - "version": 4 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - kubernetes-cluster-health-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 0, - 
"hideControls": false, - "links": [], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "editable": false, - "height": "254px", - "panels": [ - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 1, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Control Plane Components Down", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "Everything UP and healthy", - "value": "null" - }, - { - "op": "=", - "text": "", - "value": "" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": 
false, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Alerts Firing", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": 
"sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "3, 5", - "title": "Alerts Pending", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 4, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Crashlooping Pods", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "colorBackground": false, - "colorValue": true, - "colors": [ 
- "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Node Not Ready", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 6, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - 
"prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Node Disk Pressure", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Node Memory Pressure", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - 
"valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 8, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(kube_node_spec_unschedulable)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Nodes Unschedulable", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Cluster Health", - 
"version": 9 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - kubernetes-cluster-status-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "rows": [ - { - "collapse": false, - "editable": false, - "height": "129px", - "panels": [ - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 6, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Control Plane UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "UP", - "value": "null" - } - ], - "valueName": "total" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - 
"rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 6, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "3, 5", - "title": "Alerts Firing", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": true, - "title": "Cluster Health", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "168px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 1, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": 
"", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "API Servers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "Controller Managers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - 
"colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "Schedulers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - 
"text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Crashlooping Control Plane Pods", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": true, - "title": "Control Plane Status", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "158px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "80, 90", - "title": 
"CPU Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "80, 90", - "title": "Memory Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", 
- "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "80, 90", - "title": "Filesystem Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 10, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": 
"A", - "step": 600 - } - ], - "thresholds": "80, 90", - "title": "Pod Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": true, - "title": "Capacity Planning", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Cluster Status", - "version": 3 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - kubernetes-control-plane-status-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "rows": [ - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 1, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } 
- ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "API Servers UP", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "Controller 
Managers UP", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "Schedulers UP", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 4, - "links": [], - "mappingType": 1, - 
"mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "5, 10", - "title": "API Server Request Error Rate", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 7, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum 
by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 30 - } - ], - "title": "API Server Request Latency", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 5, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 60 - } - ], - "title": "End to End Scheduling Latency", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": 
"dtdurations", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 6, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Error Rate", - "refId": "A", - "step": 60 - }, - { - "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Request Rate", - "refId": "B", - "step": 60 - } - ], - "title": "API Server Request Rates", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - 
"1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Control Plane Status", - "version": 3 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - kubernetes-resource-requests-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "editable": false, - "height": "300px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - 
"targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable CPU Cores", - "refId": "A", - "step": 20 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Requested CPU Cores", - "refId": "B", - "step": 20 - } - ], - "title": "CPU Cores", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "CPU Cores", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - 
"thresholds": "80, 90", - "title": "CPU Cores", - "transparent": false, - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "CPU Cores", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "300px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable Memory", - "refId": "A", - "step": 20 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Requested Memory", - "refId": "B", - "step": 20 - } - ], - "title": "Memory", - "tooltip": { - "msResolution": 
false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": "Memory", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 4, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "Memory", - "transparent": false, - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Memory", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - 
"timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Resource Requests", - "version": 2 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - nodes-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": false, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", - "hide": false, - "intervalFactor": 10, - "legendFormat": "{{cpu}}", 
- "refId": "A", - "step": 50 - } - ], - "title": "Idle CPU", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 9, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_load1{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "node_load5{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "node_load15{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "title": "System Load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "logBase": 1, - 
"show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 4, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory used", - "metric": "", - "refId": "C", - "step": 10 - }, - { - "expr": "node_memory_Buffers{instance=\"$server\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "", - "refId": "E", - "step": 10 - }, - { - "expr": "node_memory_Cached{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "", - "refId": "F", - "step": 10 - }, - { - "expr": "node_memory_MemFree{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "", - "refId": "D", - "step": 10 - 
} - ], - "title": "Memory Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory Usage", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": 
false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 6, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 20 - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 20 - } - ], - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "ms", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 
172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk Space Usage", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 8, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - 
"max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A", - "step": 10, - "target": "" - } - ], - "title": "Network Received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 10, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - 
"legendFormat": "{{device}}", - "refId": "B", - "step": 10, - "target": "" - } - ], - "title": "Network Transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "server", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Nodes", - "version": 2 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - pods-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 1, - "hideControls": false, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "editable": false, - "height": "250px", - 
"panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": false, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Current: {{ container_name }}", - "metric": "container_memory_usage_bytes", - "refId": "A", - "step": 15 - }, - { - "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_resource_requests_memory_bytes", - "refId": "B", - "step": 20 - }, - { - "expr": "kube_pod_container_resource_limits_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "metric": "kube_pod_container_resource_limits_memory_bytes", - "refId": "C", - "step": 20 - } - ], - "title": "Memory Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": 
"bytes", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 2, - "isNew": false, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container_name)(rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", - "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A", - "step": 30 - }, - { - "expr": "kube_pod_container_resource_requests_cpu_cores{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_resource_requests_cpu_cores", - "refId": "B", - "step": 20 - }, - { - "expr": "kube_pod_container_resource_limits_cpu_cores{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "metric": "kube_pod_container_resource_limits_memory_bytes", - "refId": "C", - "step": 20 - } - ], - "title": "CPU Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 
0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": false, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "refId": "A", - "step": 30 - } - ], - "title": "Network I/O", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - 
{ - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - "options": [], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Pods", - "version": 1 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - statefulset-dashboard.json: |+ - { - "dashboard": - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - 
"annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 1, - "hideControls": false, - "links": [], - "rows": [ - { - "collapse": false, - "editable": false, - "height": "200px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "CPU", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": 
"range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "80%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}) / 1024^3", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "Bps", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", - 
"intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "100px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "metric": "kube_statefulset_replicas", - "refId": "A", - "step": 600 - } - ], - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - 
"minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Available Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": 
"max(kube_statefulset_status_observed_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_statefulset_metadata_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": 
false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B", - "step": 30 - }, - { - "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "show": true - }, - { - "format": "short", - "label": "", - "logBase": 1, - "show": false - } - ] - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "statefulset_namespace", - "options": [], - "query": "label_values(kube_statefulset_metadata_generation, 
namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "StatefulSet", - "multi": false, - "name": "statefulset_name", - "options": [], - "query": "label_values(kube_statefulset_metadata_generation{namespace=\"$statefulset_namespace\"}, statefulset)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "statefulset", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "StatefulSet", - "version": 1 - } - , - "inputs": [ - { - "name": "DS_PROMETHEUS", - "pluginId": "prometheus", - "type": "datasource", - "value": "prometheus" - } - ], - "overwrite": true - } - prometheus-datasource.json: |+ - { - "access": "proxy", - "basicAuth": false, - "name": "prometheus", - "type": "prometheus", - "url": "http://prometheus-k8s.monitoring.svc:9090" - } ---- + dashboards.yaml: |+ + - name: 'default' + org_id: 1 + folder: '' + type: file + options: + folder: /grafana-dashboard-definitions/0 diff --git a/manifests/grafana/grafana-datasources.yaml b/manifests/grafana/grafana-datasources.yaml new file mode 100644 index 00000000..33c3081f --- /dev/null +++ b/manifests/grafana/grafana-datasources.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources +data: + prometheus.yaml: |+ + datasources: + - name: prometheus + type: prometheus + access: proxy + org_id: 1 + url: http://prometheus-k8s.monitoring.svc:9090 + version: 1 + editable: false + diff --git 
a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 804657a0..a937b138 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -14,25 +14,16 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:4.6.3-non-root.1 - env: - - name: GF_AUTH_BASIC_ENABLED - value: "true" - - name: GF_AUTH_ANONYMOUS_ENABLED - value: "true" - - name: GF_SECURITY_ADMIN_USER - valueFrom: - secretKeyRef: - name: grafana-credentials - key: user - - name: GF_SECURITY_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - name: grafana-credentials - key: password + image: quay.io/coreos/monitoring-grafana:5.0.0-beta1 volumeMounts: - name: grafana-storage mountPath: /data + - name: grafana-datasources + mountPath: /grafana/conf/provisioning/datasources + - name: grafana-dashboards + mountPath: /grafana/conf/provisioning/dashboards + - name: grafana-dashboard-definitions-0 + mountPath: /grafana-dashboard-definitions/0 ports: - name: web containerPort: 3000 @@ -43,35 +34,15 @@ spec: limits: memory: 200Mi cpu: 200m - - name: grafana-watcher - image: quay.io/coreos/grafana-watcher:v0.0.8 - args: - - '--watch-dir=/var/grafana-dashboards-0' - - '--grafana-url=http://localhost:3000' - env: - - name: GRAFANA_USER - valueFrom: - secretKeyRef: - name: grafana-credentials - key: user - - name: GRAFANA_PASSWORD - valueFrom: - secretKeyRef: - name: grafana-credentials - key: password - resources: - requests: - memory: "16Mi" - cpu: "50m" - limits: - memory: "32Mi" - cpu: "100m" - volumeMounts: - - name: grafana-dashboards-0 - mountPath: /var/grafana-dashboards-0 volumes: - name: grafana-storage emptyDir: {} - - name: grafana-dashboards-0 + - name: grafana-datasources configMap: - name: grafana-dashboards-0 + name: grafana-datasources + - name: grafana-dashboards + configMap: + name: grafana-dashboards + - name: grafana-dashboard-definitions-0 + configMap: + name: 
grafana-dashboard-definitions-0 From e8a19234dc0672d1f5ea00f420931ab616f4e241 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 12 Feb 2018 11:49:10 +0100 Subject: [PATCH 196/638] kube-prometheus/hack: Add generation for grafana dashboard source file --- .../bin/grafana_dashboards_generate.sh | 24 +++++++++++++++++++ .../grafana-dashboards-template.yaml | 7 ++++++ hack/scripts/generate-manifests.sh | 5 ++-- manifests/grafana/grafana-dashboards.yaml | 2 +- 4 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards-template.yaml diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh index d89cfc97..ee6b49e7 100755 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh @@ -77,11 +77,13 @@ DASHBOARD_HEADER_FILE="$TEMPLATES_DIR/dashboard.header" DASHBOARD_FOOT_FILE="$TEMPLATES_DIR/dashboard.foot" CONFIGMAP_HEADER="$TEMPLATES_DIR/ConfigMap.header" GRAFANA_DEPLOYMENT_TEMPLATE="$TEMPLATES_DIR/grafana-deployment-template.yaml" +GRAFANA_DASHBOARDS_TEMPLATE="$TEMPLATES_DIR/grafana-dashboards-template.yaml" OUTPUT_BASE_DIR="$TOOL_HOME/output" # Some default values OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-dashboards-configMap-$DATE_EXEC.yaml" GRAFANA_OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-deployment-$DATE_EXEC.yaml" +GRAFANA_DASHBOARD_OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-dashboards-$DATE_EXEC.yaml" DASHBOARDS_DIR="$TEMPLATES_DIR/grafana-dashboards" APPLY_CONFIGMAP="false" @@ -100,6 +102,10 @@ while (( "$#" )); do GRAFANA_OUTPUT_FILE="$2" shift ;; + "-d" | "--grafana-dashboard-output-file") + GRAFANA_DASHBOARD_OUTPUT_FILE="$2" + shift + ;; "-i" | "--input-dir") DASHBOARDS_DIR="$2" shift @@ -298,6 +304,7 @@ test -f "$DASHBOARD_FOOT_FILE" || { echo "Template 
$DASHBOARD_FOOT_FILE not foun test -f "$DASHBOARD_HEADER_FILE" || { echo "Template $DASHBOARD_HEADER_FILE not found"; exit 1; } test -f "$CONFIGMAP_HEADER" || { echo "Template $CONFIGMAP_HEADER not found"; exit 1; } test -f "$GRAFANA_DEPLOYMENT_TEMPLATE" || { echo "Template $GRAFANA_DEPLOYMENT_TEMPLATE not found"; exit 1; } +test -f "$GRAFANA_DASHBOARDS_TEMPLATE" || { echo "Template $GRAFANA_DASHBOARDS_TEMPLATE not found"; exit 1; } test ! -d "$OUTPUT_BASE_DIR" && { echo "ERROR: missing directory $OUTPUT_BASE_DIR"; exit 1; } @@ -306,6 +313,7 @@ test -d "$DASHBOARDS_DIR" || { echo "ERROR: Dashboards directory not found: $DAS test -f "$OUTPUT_FILE" && { echo "ERROR: Output file already exists: $OUTPUT_FILE"; exit 1; } test -f "$GRAFANA_OUTPUT_FILE" && { echo "ERROR: Output file already exists: $GRAFANA_OUTPUT_FILE"; exit 1; } +test -f "$GRAFANA_DASHBOARD_OUTPUT_FILE" && { echo "ERROR: Output file already exists: $GRAFANA_DASHBOARD_OUTPUT_FILE"; exit 1; } touch $OUTPUT_FILE || { echo "ERROR: Unable to create or modify $OUTPUT_FILE"; exit 1; } touch $GRAFANA_OUTPUT_FILE || { echo "ERROR: Unable to create or modify $GRAFANA_OUTPUT_FILE"; exit 1; } @@ -380,6 +388,22 @@ sed -e "s#XXX_VOLUMES_XXX#$(indentMultiLineString 6 "$VOLUMES")#" \ -e "s#XXX_WATCH_DIR_XXX#$(indentMultiLineString 10 "$WATCH_DIR")#" \ $GRAFANA_DEPLOYMENT_TEMPLATE > $GRAFANA_OUTPUT_FILE +echo +echo "# Generating Grafana dashboard sources file for $total_configmaps_created directories" +DASHBOARD_SOURCES="" +for (( j=0; j<$total_configmaps_created; j++ )); do + echo "# Preparing grafana dashboards sources to support configmap: /grafana-dashboard-definitions/$j" + test "$DASHBOARD_SOURCES" && DASHBOARD_SOURCES="$DASHBOARD_SOURCES\n- name: '$j'\n org_id: 1\n folder: ''\n type: file\n options:\n folder: /grafana-dashboard-definitions/$j" || DASHBOARD_SOURCES="- name: '$j'\n org_id: 1\n folder: ''\n type: file\n options:\n folder: /grafana-dashboard-definitions/$j" + + # echo "DEBUG:" + # echo
"DASHBOARD_SOURCES: $DASHBOARD_SOURCES" + echo +done + +echo "# Processing grafana dashboards template into $GRAFANA_DASHBOARD_OUTPUT_FILE" +sed -e "s#XXX_DASHBOARDS_XXX#$(indentMultiLineString 4 "$DASHBOARD_SOURCES")#" \ + $GRAFANA_DASHBOARDS_TEMPLATE > $GRAFANA_DASHBOARD_OUTPUT_FILE + # If output file is empty we can delete it and exit test ! -s "$OUTPUT_FILE" && { echo "# Configmap empty, deleting file"; rm $OUTPUT_FILE; exit 0; } test ! -s "$GRAFANA_OUTPUT_FILE" && { echo "# Configmap empty, deleting file"; rm $GRAFANA_OUTPUT_FILE; exit 0; } diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards-template.yaml new file mode 100644 index 00000000..a8b00982 --- /dev/null +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards-template.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards +data: + dashboards.yaml: |+ +XXX_DASHBOARDS_XXX diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index b6f35a2d..6f14056b 100755 --- a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -13,9 +13,10 @@ hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashbo # Input dir: assets/grafana # output file: manifests/grafana/grafana-dashboards.yaml # grafana deployment output file: manifests/grafana/grafana-deployment.yaml -test -f manifests/grafana/grafana-dashboards.yaml && rm -f manifests/grafana/grafana-dashboard-definitions.yaml +test -f manifests/grafana/grafana-dashboard-definitions.yaml && rm -f manifests/grafana/grafana-dashboard-definitions.yaml test -f manifests/grafana/grafana-deployment.yaml && rm -f manifests/grafana/grafana-deployment.yaml -hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana/generated -o 
manifests/grafana/grafana-dashboard-definitions.yaml -g manifests/grafana/grafana-deployment.yaml +test -f manifests/grafana/grafana-dashboards.yaml && rm -f manifests/grafana/grafana-dashboards.yaml +hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana/generated -o manifests/grafana/grafana-dashboard-definitions.yaml -g manifests/grafana/grafana-deployment.yaml -d manifests/grafana/grafana-dashboards.yaml # Generate Grafana Credentials Secret hack/scripts/generate-grafana-credentials-secret.sh admin admin > manifests/grafana/grafana-credentials.yaml diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index ed3fe5fe..772d3f64 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -4,7 +4,7 @@ metadata: name: grafana-dashboards data: dashboards.yaml: |+ - - name: 'default' + - name: '0' org_id: 1 folder: '' type: file From 936622aa3e022b49abd9bfcb1c87dcb191bb2f6b Mon Sep 17 00:00:00 2001 From: Dan Mace Date: Tue, 13 Feb 2018 16:05:17 -0500 Subject: [PATCH 197/638] Add missing resource to operator role Add a missing 'prometheuses/finalizers' resource to the operator cluster role. Without this role, the operator will fail to create a default secret for a Prometheus instance in the absence of any service monitors. The fix seems to already be present in the `examples` version of the role; this patch brings the contrib version in line with the example. 
--- .../prometheus-operator/prometheus-operator-cluster-role.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml index 0a78305b..809c9a70 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -20,6 +20,7 @@ rules: resources: - alertmanagers - prometheuses + - prometheuses/finalizers - servicemonitors verbs: - "*" From 863156dcc9e9f5fe1b1109eb4360c4e3231f4168 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 14 Feb 2018 13:43:39 +0100 Subject: [PATCH 198/638] *: bump Prometheus and Alertmanager versions --- manifests/alertmanager/alertmanager.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index 00651044..3157d587 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -6,4 +6,4 @@ metadata: alertmanager: main spec: replicas: 3 - version: v0.13.0 + version: v0.14.0 diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 401784fa..7ebe792f 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v2.1.0 + version: v2.2.0-rc.0 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: From 414b380cfa43d406a7b697d88527bf57569f92fa Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 14 Feb 2018 13:47:34 +0100 Subject: [PATCH 199/638] *: Bump Prometheus Operator version to v0.17.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 71af2d7d..73affb96 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -15,7 +15,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.16.1 + image: quay.io/coreos/prometheus-operator:v0.17.0 name: prometheus-operator ports: - containerPort: 8080 From 80b2a511be39dadc99da56c08d8b1e41f050d52c Mon Sep 17 00:00:00 2001 From: Dan Mace Date: Thu, 15 Feb 2018 16:12:54 -0500 Subject: [PATCH 200/638] Give operator permission to manage alertmanagers Add a missing resource to the operator's cluster role allowing it to correctly manage alertmanagers. --- .../prometheus-operator/prometheus-operator-cluster-role.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml index 809c9a70..1b13c899 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -21,6 +21,7 @@ rules: - alertmanagers - prometheuses - prometheuses/finalizers + - alertmanagers/finalizers - servicemonitors verbs: - "*" From 1c3fb8462fb0b08ae367cfe27de86db508b2f0e4 Mon Sep 17 00:00:00 2001 From: jordanjennings Date: Fri, 16 Feb 2018 12:37:32 -0500 Subject: [PATCH 201/638] Update to grafana 5.0.0-beta5 --- grafana-image/Makefile | 2 +- .../templates/grafana-deployment-template.yaml | 2 +- manifests/grafana/grafana-deployment.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grafana-image/Makefile b/grafana-image/Makefile index 9b352c17..a9ce738d 100644 --- a/grafana-image/Makefile +++ b/grafana-image/Makefile @@ -1,4 +1,4 @@ -VERSION=5.0.0-beta1
+VERSION=5.0.0-beta5 IMAGE_TAG=$(VERSION) container: diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml index fae45b25..2c23b533 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -14,7 +14,7 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:5.0.0-beta1 + image: quay.io/coreos/monitoring-grafana:5.0.0-beta5 volumeMounts: - name: grafana-storage mountPath: /data diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index a937b138..a894e1c9 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -14,7 +14,7 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:5.0.0-beta1 + image: quay.io/coreos/monitoring-grafana:5.0.0-beta5 volumeMounts: - name: grafana-storage mountPath: /data From 0ae6c98a4896e6b2211d39cb25017f0dbc3ad7a1 Mon Sep 17 00:00:00 2001 From: Antoine Legrand <2t.antoine@gmail.com> Date: Thu, 22 Feb 2018 13:46:27 +0100 Subject: [PATCH 202/638] Add alert if it no samples are ingested --- assets/prometheus/rules/prometheus.rules.yaml | 16 ++++++++++++++++ manifests/prometheus/prometheus-k8s-rules.yaml | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml index e27aa281..e006ba9b 100644 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -8,6 +8,7 @@ groups: severity: warning annotations: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull 
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity for: 10m @@ -16,6 +17,7 @@ groups: annotations: description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}} + - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01 @@ -25,6 +27,7 @@ groups: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03 @@ -34,6 +37,7 @@ groups: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusNotConnectedToAlertmanagers expr: prometheus_notifications_alertmanagers_discovered < 1 for: 10m @@ -42,6 +46,7 @@ groups: annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers + - alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 for: 12h @@ -51,6 +56,7 @@ groups: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk + - alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 for: 12h @@ -60,6 +66,7 @@ groups: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.' 
summary: Prometheus has issues compacting sample blocks + - alert: PrometheusTSDBWALCorruptions expr: tsdb_wal_corruptions_total > 0 for: 4h @@ -69,3 +76,12 @@ groups: description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).' summary: Prometheus write-ahead log is corrupted + + - alert: PrometheusNotIngestingSamples + expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0 + for: 10m + labels: + severity: warning + annotations: + description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." + summary: "Prometheus isn't ingesting samples" diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index d563a571..f6b2b8f8 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -539,6 +539,7 @@ data: severity: warning annotations: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity for: 10m @@ -547,6 +548,7 @@ data: annotations: description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}} + - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01 @@ -556,6 +558,7 @@ data: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03 @@ -565,6 +568,7 @@ data: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + 
- alert: PrometheusNotConnectedToAlertmanagers expr: prometheus_notifications_alertmanagers_discovered < 1 for: 10m @@ -573,6 +577,7 @@ data: annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers + - alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 for: 12h @@ -582,6 +587,7 @@ data: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk + - alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 for: 12h @@ -591,6 +597,7 @@ data: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.' summary: Prometheus has issues compacting sample blocks + - alert: PrometheusTSDBWALCorruptions expr: tsdb_wal_corruptions_total > 0 for: 4h @@ -600,3 +607,12 @@ data: description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).' summary: Prometheus write-ahead log is corrupted + + - alert: PrometheusNotIngestingSamples + expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0 + for: 10m + labels: + severity: warning + annotations: + description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." 
+ summary: "Prometheus isn't ingesting samples" From eb636277cb5eab6186be91a0c1e3f8f100d06405 Mon Sep 17 00:00:00 2001 From: Jordan Jennings Date: Mon, 26 Feb 2018 10:11:56 -0500 Subject: [PATCH 203/638] Update to Prometheus v2.2.0-rc.1 --- manifests/prometheus/prometheus-k8s.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 7ebe792f..6e7f239d 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v2.2.0-rc.0 + version: v2.2.0-rc.1 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: From 14d57e1fec8e21de757fb822ae6061dbc63b2c3c Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Date: Wed, 28 Feb 2018 13:36:28 -0500 Subject: [PATCH 204/638] Added information on webhook token authentication Added information on webhook token authentication and information on how to update the manifests if Kubernetes is already deployed with kubeadm. --- docs/kube-prometheus-on-kubeadm.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/kube-prometheus-on-kubeadm.md b/docs/kube-prometheus-on-kubeadm.md index fac211db..e1fe2da9 100644 --- a/docs/kube-prometheus-on-kubeadm.md +++ b/docs/kube-prometheus-on-kubeadm.md @@ -48,14 +48,22 @@ In addition, we will be using `node-exporter` to monitor the `cAdvisor` service > The kubeadm deb package ships with configuration for how the kubelet should be run. Note that the `kubeadm` CLI command will never touch this drop-in file. This drop-in file belongs to the kubeadm deb/rpm package. -Again, we need to expose the `cadvisor` that is installed and managed by the `kubelet` daemon. To do so, we do the following on all the masters and nodes: +Again, we need to expose the `cadvisor` that is installed and managed by the `kubelet` daemon and allow webhook token authentication. 
To do so, we do the following on all the masters and nodes: ``` sed -e "/cadvisor-port=0/d" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf +sed -e "s/--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf systemctl daemon-reload systemctl restart kubelet ``` +If you already have a Kubernetes cluster deployed with kubeadm, also change the address that kube-controller-manager and kube-scheduler listen on, in addition to the previous kubelet change: + +``` +sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-controller-manager.yaml +sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-scheduler.yaml +``` + With these changes, your Kubernetes cluster is ready. ## Metric Sources From 7fe4506ae477e9982e92ad6c2ece3093fce506ac Mon Sep 17 00:00:00 2001 From: Akihito INOH Date: Wed, 28 Feb 2018 14:52:34 +0900 Subject: [PATCH 205/638] Update alert rule for kubelet Update alert rule check kubelet down ratio from 1% to 10%. In #774 , it is changed to 1%, so returns to 10%. --- assets/prometheus/rules/kubelet.rules.yaml | 2 +- manifests/prometheus/prometheus-k8s-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml index a1fc93cb..0edd7878 100644 --- a/assets/prometheus/rules/kubelet.rules.yaml +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -28,7 +28,7 @@ groups: description: Prometheus failed to scrape {{ $value }}% of kubelets.
- alert: K8SKubeletDown expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) - * 100 > 1 + * 100 > 10 for: 1h labels: severity: critical diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index f6b2b8f8..c7cb14ac 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -365,7 +365,7 @@ data: description: Prometheus failed to scrape {{ $value }}% of kubelets. - alert: K8SKubeletDown expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) - * 100 > 1 + * 100 > 10 for: 1h labels: severity: critical From c54468ab7b01dfdfae2c448304e7909a11c97236 Mon Sep 17 00:00:00 2001 From: Jordan Jennings Date: Thu, 1 Mar 2018 11:32:42 -0500 Subject: [PATCH 206/638] Update to grafana 5.0.0 --- grafana-image/Makefile | 2 +- .../templates/grafana-deployment-template.yaml | 2 +- manifests/grafana/grafana-deployment.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grafana-image/Makefile b/grafana-image/Makefile index a9ce738d..a957a7e9 100644 --- a/grafana-image/Makefile +++ b/grafana-image/Makefile @@ -1,4 +1,4 @@ -VERSION=5.0.0-beta5 +VERSION=5.0.0 IMAGE_TAG=$(VERSION) container: diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml index 2c23b533..c5fc633c 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -14,7 +14,7 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:5.0.0-beta5 + image: quay.io/coreos/monitoring-grafana:5.0.0 volumeMounts: - name: grafana-storage mountPath: /data diff --git a/manifests/grafana/grafana-deployment.yaml 
b/manifests/grafana/grafana-deployment.yaml index a894e1c9..2c5fed77 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -14,7 +14,7 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:5.0.0-beta5 + image: quay.io/coreos/monitoring-grafana:5.0.0 volumeMounts: - name: grafana-storage mountPath: /data From 8b6ee5c18b9323888cc1c146beed819984656f92 Mon Sep 17 00:00:00 2001 From: Alexander Holte-Davidsen Date: Mon, 5 Mar 2018 09:52:51 +0100 Subject: [PATCH 207/638] Add summary to Alertmanager rules where missing - updated accoring to guidelines --- assets/prometheus/rules/alertmanager.rules.yaml | 3 +++ assets/prometheus/rules/kubelet.rules.yaml | 1 + assets/prometheus/rules/kubernetes.rules.yaml | 6 ++++++ assets/prometheus/rules/node.rules.yaml | 2 ++ assets/prometheus/rules/prometheus.rules.yaml | 5 +++++ manifests/prometheus/prometheus-k8s-rules.yaml | 17 +++++++++++++++++ 6 files changed, 34 insertions(+) diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml index fdfdfd0f..5e51f75b 100644 --- a/assets/prometheus/rules/alertmanager.rules.yaml +++ b/assets/prometheus/rules/alertmanager.rules.yaml @@ -11,6 +11,7 @@ groups: annotations: description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. + summary: Configuration out of sync - alert: AlertmanagerDownOrMissing expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 @@ -20,6 +21,7 @@ groups: annotations: description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. 
+ summary: Alertmanager down or missing - alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful == 0 for: 10m @@ -28,3 +30,4 @@ groups: annotations: description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. + summary: Alertmanager's configuration reload failed diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml index 0edd7878..85547dd6 100644 --- a/assets/prometheus/rules/kubelet.rules.yaml +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -26,6 +26,7 @@ groups: severity: warning annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Prometheus failed to scrape - alert: K8SKubeletDown expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10 diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml index f961ce6b..288841b7 100644 --- a/assets/prometheus/rules/kubernetes.rules.yaml +++ b/assets/prometheus/rules/kubernetes.rules.yaml @@ -51,6 +51,7 @@ groups: annotations: description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}} + summary: API server high latency - alert: APIServerLatencyHigh expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 @@ -60,6 +61,7 @@ groups: annotations: description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}} + summary: API server high latency - alert: APIServerErrorsHigh expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2 @@ -68,6 +70,7 @@ groups: severity: warning annotations: description: API server returns errors for {{ $value }}% of requests + summary: API server request errors - alert: APIServerErrorsHigh expr: 
rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5 @@ -84,12 +87,14 @@ groups: annotations: description: No API servers are reachable or all have disappeared from service discovery + summary: No API servers are reachable - alert: K8sCertificateExpirationNotice labels: severity: warning annotations: description: Kubernetes API Certificate is expiring soon (less than 7 days) + summary: Kubernetes API Certificate is expiering soon expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 - alert: K8sCertificateExpirationNotice @@ -97,4 +102,5 @@ groups: severity: critical annotations: description: Kubernetes API Certificate is expiring in less than 1 day + summary: Kubernetes API Certificate is expiering expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml index 0e7e1bbd..d14f0870 100644 --- a/assets/prometheus/rules/node.rules.yaml +++ b/assets/prometheus/rules/node.rules.yaml @@ -26,6 +26,7 @@ groups: annotations: description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery + summary: Prometheus could not scrape a node-exporter - alert: NodeDiskRunningFull expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 for: 30m @@ -42,3 +43,4 @@ groups: annotations: description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}}) + summary: Node disk is running full diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml index e006ba9b..43f2808c 100644 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -8,6 +8,7 @@ groups: severity: warning annotations: description: Reloading Prometheus' configuration has failed for 
{{$labels.namespace}}/{{$labels.pod}} + summary: Reloading Promehteus' configuration failed - alert: PrometheusNotificationQueueRunningFull expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity @@ -17,6 +18,7 @@ groups: annotations: description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}} + summary: Prometheus' alert notification queue is running full - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) @@ -27,6 +29,7 @@ groups: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alert from Prometheus - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) @@ -37,6 +40,7 @@ groups: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alerts from Prometheus - alert: PrometheusNotConnectedToAlertmanagers expr: prometheus_notifications_alertmanagers_discovered < 1 @@ -46,6 +50,7 @@ groups: annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers + summary: Prometheus is not connected to any Alertmanagers - alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index c7cb14ac..05368dc1 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -20,6 +20,7 @@ data: annotations: description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` 
are out of sync. + summary: Configuration out of sync - alert: AlertmanagerDownOrMissing expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 @@ -29,6 +30,7 @@ data: annotations: description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. + summary: Alertmanager down or missing - alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful == 0 for: 10m @@ -37,6 +39,7 @@ data: annotations: description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. + summary: Alertmanager's configuration reload failed etcd3.rules.yaml: |+ groups: - name: ./etcd3.rules @@ -363,6 +366,7 @@ data: severity: warning annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Prometheus failed to scrape - alert: K8SKubeletDown expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10 @@ -436,6 +440,7 @@ data: annotations: description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}} + summary: API server high latency - alert: APIServerLatencyHigh expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 @@ -445,6 +450,7 @@ data: annotations: description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}} + summary: API server high latency - alert: APIServerErrorsHigh expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2 @@ -453,6 +459,7 @@ data: severity: warning annotations: description: API server returns errors for {{ $value }}% of requests + summary: API server request errors - alert: APIServerErrorsHigh expr: 
rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5 @@ -469,12 +476,14 @@ data: annotations: description: No API servers are reachable or all have disappeared from service discovery + summary: No API servers are reachable - alert: K8sCertificateExpirationNotice labels: severity: warning annotations: description: Kubernetes API Certificate is expiring soon (less than 7 days) + summary: Kubernetes API Certificate is expiering soon expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 - alert: K8sCertificateExpirationNotice @@ -482,6 +491,7 @@ data: severity: critical annotations: description: Kubernetes API Certificate is expiring in less than 1 day + summary: Kubernetes API Certificate is expiering expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 node.rules.yaml: |+ groups: @@ -512,6 +522,7 @@ data: annotations: description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery + summary: Prometheus could not scrape a node-exporter - alert: NodeDiskRunningFull expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 for: 30m @@ -528,6 +539,7 @@ data: annotations: description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}}) + summary: Node disk is running full prometheus.rules.yaml: |+ groups: - name: prometheus.rules @@ -539,6 +551,7 @@ data: severity: warning annotations: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + summary: Reloading Promehteus' configuration failed - alert: PrometheusNotificationQueueRunningFull expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity @@ -548,6 +561,7 @@ data: annotations: description: Prometheus' alert notification queue is running full for 
{{$labels.namespace}}/{{ $labels.pod}} + summary: Prometheus' alert notification queue is running full - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) @@ -558,6 +572,7 @@ data: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alert from Prometheus - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) @@ -568,6 +583,7 @@ data: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alerts from Prometheus - alert: PrometheusNotConnectedToAlertmanagers expr: prometheus_notifications_alertmanagers_discovered < 1 @@ -577,6 +593,7 @@ data: annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers + summary: Prometheus is not connected to any Alertmanagers - alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 From 5d33cbb166e1b5155004152b0286e948fc526725 Mon Sep 17 00:00:00 2001 From: Francisco Ripoli Date: Thu, 8 Mar 2018 17:36:05 +0000 Subject: [PATCH 208/638] example service monitoring updated with missing config added service account, role and role binding for the prometheus frontend example, also updated prometheus to use the correct service account fixes #1049 --- hack/example-service-monitoring/deploy | 3 +++ .../prometheus-frontend-role-binding.yaml | 13 +++++++++++++ .../example-app/prometheus-frontend-role.yaml | 17 +++++++++++++++++ .../prometheus-frontend-service-account.yaml | 4 ++++ .../example-app/prometheus-frontend.yaml | 1 + 5 files changed, 38 insertions(+) create mode 100644 
manifests/examples/example-app/prometheus-frontend-role-binding.yaml create mode 100644 manifests/examples/example-app/prometheus-frontend-role.yaml create mode 100644 manifests/examples/example-app/prometheus-frontend-service-account.yaml diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy index 7691047c..ba922b84 100755 --- a/hack/example-service-monitoring/deploy +++ b/hack/example-service-monitoring/deploy @@ -8,6 +8,9 @@ if [ -z "${NAMESPACE}" ]; then NAMESPACE=default fi +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-service-account.yaml +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-role.yaml +kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-role-binding.yaml kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-svc.yaml kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/example-app.yaml kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend.yaml diff --git a/manifests/examples/example-app/prometheus-frontend-role-binding.yaml b/manifests/examples/example-app/prometheus-frontend-role-binding.yaml new file mode 100644 index 00000000..1d6bea2c --- /dev/null +++ b/manifests/examples/example-app/prometheus-frontend-role-binding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-frontend + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-frontend +subjects: +- kind: ServiceAccount + name: prometheus-frontend + namespace: default diff --git a/manifests/examples/example-app/prometheus-frontend-role.yaml 
b/manifests/examples/example-app/prometheus-frontend-role.yaml new file mode 100644 index 00000000..79d50501 --- /dev/null +++ b/manifests/examples/example-app/prometheus-frontend-role.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-frontend + namespace: default +rules: +- apiGroups: [""] + resources: + - nodes + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] diff --git a/manifests/examples/example-app/prometheus-frontend-service-account.yaml b/manifests/examples/example-app/prometheus-frontend-service-account.yaml new file mode 100644 index 00000000..abd3e0df --- /dev/null +++ b/manifests/examples/example-app/prometheus-frontend-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-frontend diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index b55b58db..d5651529 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,6 +6,7 @@ metadata: labels: prometheus: frontend spec: + serviceAccountName: prometheus-frontend version: v1.7.1 serviceMonitorSelector: matchLabels: From f0a86796f03be5af375afd74e09765b12fc69148 Mon Sep 17 00:00:00 2001 From: Jesse Stuart Date: Sun, 11 Mar 2018 06:27:11 -0400 Subject: [PATCH 209/638] docs: Add missing argument in `kubeadm` workaround script. One of the `sed` commands for updating the kubeadm systemd file (`/etc/systemd/system/kubelet.service.d/10-kubeadm.conf`) was missing a `-i` reference to the file itself, causing it to hang indefinitely if run as declared. I also wrapped this second `sed` in an `if grep ...` check, in order to make the operation idempotent.
--- docs/kube-prometheus-on-kubeadm.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/kube-prometheus-on-kubeadm.md b/docs/kube-prometheus-on-kubeadm.md index e1fe2da9..78a499e0 100644 --- a/docs/kube-prometheus-on-kubeadm.md +++ b/docs/kube-prometheus-on-kubeadm.md @@ -50,9 +50,12 @@ In addition, we will be using `node-exporter` to monitor the `cAdvisor` service Again, we need to expose the `cadvisor` that is installed and managed by the `kubelet` daemon and allow webhook token authentication. To do so, we do the following on all the masters and nodes: -``` -sed -e "/cadvisor-port=0/d" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf -sed -e "s/--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/" +```bash +KUBEADM_SYSTEMD_CONF=/etc/systemd/system/kubelet.service.d/10-kubeadm.conf +sed -e "/cadvisor-port=0/d" -i "$KUBEADM_SYSTEMD_CONF" +if ! grep -q "authentication-token-webhook=true" "$KUBEADM_SYSTEMD_CONF"; then + sed -e "s/--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/" -i "$KUBEADM_SYSTEMD_CONF" +fi systemctl daemon-reload systemctl restart kubelet ``` From e497e48e079f1d5d431bbaf79e32068e429e14a0 Mon Sep 17 00:00:00 2001 From: Riccardo Setti Date: Mon, 12 Mar 2018 19:41:03 +0100 Subject: [PATCH 210/638] Add how to enable cAdvisor support on GKE --- docs/GKE-cadvisor-support.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 docs/GKE-cadvisor-support.md diff --git a/docs/GKE-cadvisor-support.md b/docs/GKE-cadvisor-support.md new file mode 100644 index 00000000..aeb09514 --- /dev/null +++ b/docs/GKE-cadvisor-support.md @@ -0,0 +1,24 @@ +# Kubelet / cAdvisor special configuration updates for GKE + +In order to allow Prometheus to access the endpoints provided by the kubelet/cAdvisor on GKE we have to downgrade the scheme to HTTP (from HTTPS). 
+ + +On linux: + +``` +sed -i -e 's/https/http/g' \ +contrib/kube-prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +``` + +On MacOs: + +``` +sed -i '' -e 's/https/http/g' \ +contrib/kube-prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +``` + +After you have modified the yaml file please run + +``` +kubectl apply -f contrib/kube-prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +``` From 0bda22e3801e5b9e0f4f1d256963682acfec5e4d Mon Sep 17 00:00:00 2001 From: Laurent Godet Date: Fri, 9 Mar 2018 15:03:48 +0000 Subject: [PATCH 211/638] Update Prometheus to 2.2.1 --- manifests/prometheus/prometheus-k8s.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 6e7f239d..6ce11e9f 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v2.2.0-rc.1 + version: v2.2.1 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: From 3a6d38ea0e4184ce13d82d92921a100a079c0ddc Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 19 Mar 2018 13:34:18 +0100 Subject: [PATCH 212/638] contrib/kube: Restrict example-app servicemonitor to default ns The `prometheus-frontend` role of the example app kubeprometheus section is scoped to the default namespace. Thereby the frontend Prometheus instance is not able to discover anything outside of the default namespace. We might as well restrict the front end service monitor to the default namespace too. 
--- manifests/examples/example-app/servicemonitor-frontend.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/manifests/examples/example-app/servicemonitor-frontend.yaml b/manifests/examples/example-app/servicemonitor-frontend.yaml index cc3d42fa..067a7a89 100644 --- a/manifests/examples/example-app/servicemonitor-frontend.yaml +++ b/manifests/examples/example-app/servicemonitor-frontend.yaml @@ -10,4 +10,7 @@ spec: tier: frontend endpoints: - port: web - interval: 10s \ No newline at end of file + interval: 10s + namespaceSelector: + matchNames: + - default From 2f302943fe96b5aaf474fe3ae5fab20330e8f0a1 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 19 Mar 2018 14:19:55 +0100 Subject: [PATCH 213/638] Unify rule file label across repository In some cases we have been using `alert-rules` in some cases `prometheus-rulefiles`. This led to confusion [1]. Instead, unify the Prometheus rules configmap labels to: ```yaml labels: role: alert-rules ``` [1] https://github.com/coreos/prometheus-operator/issues/1102 --- hack/scripts/generate-rules-configmap.sh | 2 +- manifests/prometheus/prometheus-k8s-rules.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh index 76ec64ca..96c5433f 100755 --- a/hack/scripts/generate-rules-configmap.sh +++ b/hack/scripts/generate-rules-configmap.sh @@ -6,7 +6,7 @@ kind: ConfigMap metadata: name: prometheus-k8s-rules labels: - role: prometheus-rulefiles + role: alert-rules prometheus: k8s data: EOF diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 05368dc1..0e548cec 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -3,7 +3,7 @@ kind: ConfigMap metadata: name: prometheus-k8s-rules labels: - role: prometheus-rulefiles + role:
alert-rules prometheus: k8s data: alertmanager.rules.yaml: |+ diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 6ce11e9f..8f243eb0 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -13,7 +13,7 @@ spec: - {key: k8s-app, operator: Exists} ruleSelector: matchLabels: - role: prometheus-rulefiles + role: alert-rules prometheus: k8s resources: requests: From 310f471cf6bc160c07018fbe1441ef414b63d1ef Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 19 Mar 2018 17:03:42 +0100 Subject: [PATCH 214/638] contrib/kube: Add rbac role to discover prometheus The current example-app setup in the kube-prometheus project is able to discover scraping targets in the default namespace. It is not able to discover the configured Alertmanager in the monitoring namespace. This patch adds an alertmanager-discovery rbac role, to permit the above described action. In addition it does the following cleanups: - Remove kubeconfig configuration in deploy and teardown script. kubectl chooses .kube/config whenever KUBECONFIG is not set by default - Remove namespace specification option via NAMESPACE env var. In most of the manifests the metadata/namespace was hardcoded anyways, in addition in the prometheus frontend role binding the service account namespace is hardcoded to default as well. - Instead of `kubectl {apply,delete}` individual manifests, the deploy and teardown shell scripts {apply,delete} on the entire folder.
--- hack/example-service-monitoring/deploy | 17 +---------------- hack/example-service-monitoring/teardown | 11 +---------- manifests/examples/example-app/example-app.yaml | 2 ++ ...end-alertmanager-discovery-role-binding.yaml | 13 +++++++++++++ ...us-frontend-alertmanager-discovery-role.yaml | 12 ++++++++++++ .../prometheus-frontend-service-account.yaml | 1 + .../example-app/prometheus-frontend-svc.yaml | 1 + .../example-app/servicemonitor-frontend.yaml | 3 ++- 8 files changed, 33 insertions(+), 27 deletions(-) create mode 100644 manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml create mode 100644 manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role.yaml diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy index ba922b84..18b0ef6a 100755 --- a/hack/example-service-monitoring/deploy +++ b/hack/example-service-monitoring/deploy @@ -1,18 +1,3 @@ #!/usr/bin/env bash -if [ -z "${KUBECONFIG}" ]; then - KUBECONFIG=~/.kube/config -fi - -if [ -z "${NAMESPACE}" ]; then - NAMESPACE=default -fi - -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-service-account.yaml -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-role.yaml -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-role-binding.yaml -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend-svc.yaml -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/example-app.yaml -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f manifests/examples/example-app/prometheus-frontend.yaml -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" apply -f 
manifests/examples/example-app/servicemonitor-frontend.yaml - +kubectl apply -f manifests/examples/example-app diff --git a/hack/example-service-monitoring/teardown b/hack/example-service-monitoring/teardown index a631fe3e..a5fc1760 100755 --- a/hack/example-service-monitoring/teardown +++ b/hack/example-service-monitoring/teardown @@ -1,12 +1,3 @@ #!/usr/bin/env bash -if [ -z "${KUBECONFIG}" ]; then - KUBECONFIG=~/.kube/config -fi - -if [ -z "${NAMESPACE}" ]; then - NAMESPACE=default -fi - -kubectl --namespace "$NAMESPACE" --kubeconfig="$KUBECONFIG" delete -f manifests/examples/example-app - +kubectl delete -f manifests/examples/example-app diff --git a/manifests/examples/example-app/example-app.yaml b/manifests/examples/example-app/example-app.yaml index adb6602b..0262fd3c 100644 --- a/manifests/examples/example-app/example-app.yaml +++ b/manifests/examples/example-app/example-app.yaml @@ -4,6 +4,7 @@ metadata: name: example-app labels: tier: frontend + namespace: default spec: selector: app: example-app @@ -17,6 +18,7 @@ apiVersion: extensions/v1beta1 kind: Deployment metadata: name: example-app + namespace: default spec: replicas: 4 template: diff --git a/manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml b/manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml new file mode 100644 index 00000000..09b3f5e4 --- /dev/null +++ b/manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-frontend + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: alertmanager-discovery +subjects: +- kind: ServiceAccount + name: prometheus-frontend + namespace: default diff --git a/manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role.yaml 
b/manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role.yaml new file mode 100644 index 00000000..84319cdd --- /dev/null +++ b/manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: alertmanager-discovery + namespace: monitoring +rules: +- apiGroups: [""] + resources: + - services + - endpoints + - pods + verbs: ["list", "watch"] diff --git a/manifests/examples/example-app/prometheus-frontend-service-account.yaml b/manifests/examples/example-app/prometheus-frontend-service-account.yaml index abd3e0df..4dd7c26b 100644 --- a/manifests/examples/example-app/prometheus-frontend-service-account.yaml +++ b/manifests/examples/example-app/prometheus-frontend-service-account.yaml @@ -2,3 +2,4 @@ apiVersion: v1 kind: ServiceAccount metadata: name: prometheus-frontend + namespace: default diff --git a/manifests/examples/example-app/prometheus-frontend-svc.yaml b/manifests/examples/example-app/prometheus-frontend-svc.yaml index 6a269155..7002e8f1 100644 --- a/manifests/examples/example-app/prometheus-frontend-svc.yaml +++ b/manifests/examples/example-app/prometheus-frontend-svc.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Service metadata: name: prometheus-frontend + namespace: default spec: type: NodePort ports: diff --git a/manifests/examples/example-app/servicemonitor-frontend.yaml b/manifests/examples/example-app/servicemonitor-frontend.yaml index cc3d42fa..709ab2cb 100644 --- a/manifests/examples/example-app/servicemonitor-frontend.yaml +++ b/manifests/examples/example-app/servicemonitor-frontend.yaml @@ -2,6 +2,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: frontend + namespace: default labels: tier: frontend spec: @@ -10,4 +11,4 @@ spec: tier: frontend endpoints: - port: web - interval: 10s \ No newline at end of file + interval: 10s From 07263e98c0a13b29c74afae109d92e8baa7f9192 Mon Sep 17 
00:00:00 2001 From: Frederic Branczyk Date: Fri, 9 Mar 2018 15:12:59 +0100 Subject: [PATCH 215/638] *: upgrade client-go --- manifests/prometheus-operator/prometheus-operator.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 73affb96..f68239b7 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -1,4 +1,4 @@ -apiVersion: extensions/v1beta1 +apiVersion: apps/v1beta2 kind: Deployment metadata: labels: @@ -6,6 +6,9 @@ metadata: name: prometheus-operator spec: replicas: 1 + selector: + matchLabels: + k8s-app: prometheus-operator template: metadata: labels: From 4c77a9db1d2cc8f5c84e961c6083758e7039695e Mon Sep 17 00:00:00 2001 From: Alexander Holte-Davidsen Date: Thu, 22 Mar 2018 11:32:38 +0100 Subject: [PATCH 216/638] Update Alert Manager rules for NodeDiskRunningFull with summary --- assets/prometheus/rules/node.rules.yaml | 3 ++- manifests/prometheus/prometheus-k8s-rules.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml index d14f0870..9a9d599c 100644 --- a/assets/prometheus/rules/node.rules.yaml +++ b/assets/prometheus/rules/node.rules.yaml @@ -35,6 +35,7 @@ groups: annotations: description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}}) + summary: Node disk is running full within 24 hours - alert: NodeDiskRunningFull expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 for: 10m @@ -43,4 +44,4 @@ groups: annotations: description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full + summary: Node disk is running full within 2 hours diff 
--git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 0e548cec..470d27c9 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -531,6 +531,7 @@ data: annotations: description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}}) + summary: Node disk is running full within 24 hours - alert: NodeDiskRunningFull expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 for: 10m @@ -539,7 +540,7 @@ data: annotations: description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full + summary: Node disk is running full within 2 hours prometheus.rules.yaml: |+ groups: - name: prometheus.rules From 3246c489136b336a74746d27e7f378875aa97ac0 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 23 Mar 2018 14:45:46 +0100 Subject: [PATCH 217/638] prometheus: Add ability to whitelist Kubernetes labels --- manifests/examples/example-app/example-app.yaml | 8 ++++---- .../examples/example-app/servicemonitor-frontend.yaml | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/manifests/examples/example-app/example-app.yaml b/manifests/examples/example-app/example-app.yaml index 0262fd3c..708e8afa 100644 --- a/manifests/examples/example-app/example-app.yaml +++ b/manifests/examples/example-app/example-app.yaml @@ -1,13 +1,13 @@ kind: Service apiVersion: v1 -metadata: +metadata: name: example-app labels: tier: frontend namespace: default -spec: - selector: - app: example-app +spec: + selector: + app: example-app ports: - name: web protocol: TCP diff --git a/manifests/examples/example-app/servicemonitor-frontend.yaml b/manifests/examples/example-app/servicemonitor-frontend.yaml index 23f27376..2bc935af 100644 --- 
a/manifests/examples/example-app/servicemonitor-frontend.yaml +++ b/manifests/examples/example-app/servicemonitor-frontend.yaml @@ -9,6 +9,8 @@ spec: selector: matchLabels: tier: frontend + targetLabels: + - tier endpoints: - port: web interval: 10s From e50adc092622a6bb4e8a5491671683561f949aa6 Mon Sep 17 00:00:00 2001 From: Michael Pietzsch Date: Mon, 26 Mar 2018 11:25:41 +0200 Subject: [PATCH 218/638] Grafana Update to 5.0.3 (#1149) --- grafana-image/Makefile | 2 +- .../templates/grafana-deployment-template.yaml | 2 +- manifests/grafana/grafana-deployment.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grafana-image/Makefile b/grafana-image/Makefile index a957a7e9..8df556c6 100644 --- a/grafana-image/Makefile +++ b/grafana-image/Makefile @@ -1,4 +1,4 @@ -VERSION=5.0.0 +VERSION=5.0.3 IMAGE_TAG=$(VERSION) container: diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml index c5fc633c..9b978e90 100644 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml @@ -14,7 +14,7 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:5.0.0 + image: quay.io/coreos/monitoring-grafana:5.0.3 volumeMounts: - name: grafana-storage mountPath: /data diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 2c5fed77..9eb8750f 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -14,7 +14,7 @@ spec: runAsUser: 65534 containers: - name: grafana - image: quay.io/coreos/monitoring-grafana:5.0.0 + image: quay.io/coreos/monitoring-grafana:5.0.3 volumeMounts: - name: grafana-storage mountPath: /data From 2c10f81102b6d386706977efd54737c0add8a8cf Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?S=C3=A9bastien=20GLON?= Date: Mon, 26 Mar 2018 18:11:40 +0200 Subject: [PATCH 219/638] Add new alert for samples rejected due to duplicate timestamp (#1148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sébastien GLON --- assets/prometheus/rules/prometheus.rules.yaml | 9 +++++++++ manifests/prometheus/prometheus-k8s-rules.yaml | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml index 43f2808c..da699c32 100644 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -90,3 +90,12 @@ groups: annotations: description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." summary: "Prometheus isn't ingesting samples" + + - alert: PrometheusTargetScapesDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 10m + labels: + severity: warning + annotations: + description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values" + summary: Prometheus has many samples rejected diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 470d27c9..403e4383 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -634,3 +634,12 @@ data: annotations: description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
summary: "Prometheus isn't ingesting samples" + + - alert: PrometheusTargetScapesDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 10m + labels: + severity: warning + annotations: + description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values" + summary: Prometheus has many samples rejected From b566db5d4bede3f9d6c0f9fe76472d64bcc73d2d Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 4 Apr 2018 10:06:38 +0200 Subject: [PATCH 220/638] *: Bump version to v0.18.0 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index f68239b7..3111cbb7 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -18,7 +18,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.17.0 + image: quay.io/coreos/prometheus-operator:v0.18.0 name: prometheus-operator ports: - containerPort: 8080 From 889eca2cd47a01a76afe6de02902d474203a734c Mon Sep 17 00:00:00 2001 From: Richard Maynard Date: Thu, 5 Apr 2018 02:57:56 -0500 Subject: [PATCH 221/638] charts: Add Core DNS Support (#1176) * charts: Add Core DNS Support With CoreDNS becoming a first class citizen it should be an optional part of the kube-prometheus deployment. Fixes: #1174 * added kube-prometheus service and service monitor In order to work directly with a cluster created by bootkube used port 9153, and created a service exposing metrics since bootkube does not. Also in the helm chart changed the default port to 9153 since that is the coredns plugins default port and to be consistent throughout the repo. 
--- ...rometheus-k8s-service-coredns-metrics.yaml | 18 ++++++++++++++++++ ...rometheus-k8s-service-monitor-coredns.yaml | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 manifests/prometheus/prometheus-k8s-service-coredns-metrics.yaml create mode 100644 manifests/prometheus/prometheus-k8s-service-monitor-coredns.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-coredns-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-coredns-metrics.yaml new file mode 100644 index 00000000..cd90a55e --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-coredns-metrics.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: coredns-prometheus-discovery + namespace: kube-system + labels: + k8s-app: coredns + component: metrics +spec: + ports: + - name: http-metrics + port: 9153 + protocol: TCP + targetPort: 9153 + selector: + k8s-app: coredns + type: ClusterIP + clusterIP: None diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-coredns.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-coredns.yaml new file mode 100644 index 00000000..362ac899 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-coredns.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: coredns + name: coredns +spec: + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: coredns + component: metrics + namespaceSelector: + matchNames: + - kube-system + endpoints: + - port: http-metrics + interval: 15s + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token From b6e2f2ae3132e6f727463a8a3c713c356ce31db8 Mon Sep 17 00:00:00 2001 From: Keyvan Hedayati Date: Sat, 7 Apr 2018 17:09:22 +0430 Subject: [PATCH 222/638] Minor typo fix --- docs/Monitoring external etcd.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Monitoring external etcd.md b/docs/Monitoring external etcd.md index 2bb79698..edf9af42 100644 
--- a/docs/Monitoring external etcd.md +++ b/docs/Monitoring external etcd.md @@ -51,7 +51,7 @@ The below manifest creates a Service to expose etcd metrics (port 2379) In case you have generated the etcd certificated with kube-aws, you will need to use insecureSkipVerify as the valid certificate domain will be different for each etcd node (etcd0, etcd1, etcd2). If you only have one etcd node, you can use the value from `etcd.internalDomainName` speficied in your kube-aws `cluster.yaml` -In this example we use insecureSkipVerify: true as kube-aws default certiicates are not valid against the IP. They were created for the DNS. Depending on your use case, you might want to remove this flag or set it to false. (true required for kube-aws if using default certificate generators method) +In this example we use insecureSkipVerify: true as kube-aws default certificates are not valid against the IP. They were created for the DNS. Depending on your use case, you might want to remove this flag or set it to false. 
(true required for kube-aws if using default certificate generators method) ``` apiVersion: v1 From 46c49616e06d56b7ea7408f080a289d995c0301d Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 9 Apr 2018 14:27:52 +0200 Subject: [PATCH 223/638] *: Bump version to v0.18.1 --- manifests/prometheus-operator/prometheus-operator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 3111cbb7..d0030111 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -18,7 +18,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.18.0 + image: quay.io/coreos/prometheus-operator:v0.18.1 name: prometheus-operator ports: - containerPort: 8080 From b10e34368972ed30b1eefae81298808875d7a6f2 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 10 Apr 2018 10:27:54 +0200 Subject: [PATCH 224/638] kube-prometheus: Fix minor typo --- assets/prometheus/rules/kube-state-metrics.rules.yaml | 2 +- manifests/prometheus/prometheus-k8s-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/prometheus/rules/kube-state-metrics.rules.yaml b/assets/prometheus/rules/kube-state-metrics.rules.yaml index 9325df0b..2a5b9527 100644 --- a/assets/prometheus/rules/kube-state-metrics.rules.yaml +++ b/assets/prometheus/rules/kube-state-metrics.rules.yaml @@ -54,6 +54,6 @@ groups: labels: severity: warning annotations: - description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}} + description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}} times within the last hour summary: Pod is restarting frequently diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml 
b/manifests/prometheus/prometheus-k8s-rules.yaml index 403e4383..7a8911d3 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -334,7 +334,7 @@ data: labels: severity: warning annotations: - description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}} + description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}} times within the last hour summary: Pod is restarting frequently kubelet.rules.yaml: |+ From f9b03ddd9d92143645a248ae34956efe0db38f31 Mon Sep 17 00:00:00 2001 From: Andrey Klimentyev Date: Tue, 10 Apr 2018 14:54:48 +0300 Subject: [PATCH 225/638] kube-prometheus: fixed CPU accounting Currently, node recording rules feature an incorrect idle CPU accounting. This change aims to fix that. --- assets/prometheus/rules/node.rules.yaml | 6 +++--- manifests/prometheus/prometheus-k8s-rules.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml index 9a9d599c..e678ca84 100644 --- a/assets/prometheus/rules/node.rules.yaml +++ b/assets/prometheus/rules/node.rules.yaml @@ -2,7 +2,7 @@ groups: - name: node.rules rules: - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_filesystem_usage:sum expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) @@ -12,10 +12,10 @@ groups: - record: instance:node_network_transmit_bytes:rate:sum expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) - 
record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle"}[5m])) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:ratio expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - alert: NodeExporterDown diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 403e4383..552aad5e 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -498,7 +498,7 @@ data: - name: node.rules rules: - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_filesystem_usage:sum expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) @@ -508,10 +508,10 @@ data: - record: instance:node_network_transmit_bytes:rate:sum expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle"}[5m])) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:ratio expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - alert: NodeExporterDown From a2d273b11a651cf99fb97993e0b11be9404002c9 Mon Sep 17 00:00:00 2001 From: Arslanbekov Denis Date: Tue, 10 Apr 2018 18:18:24 +0300 Subject: [PATCH 226/638] In description is displayed correctly namespace (#1190) * in description is displayed correctly namespace * Bump kube state version * Update Chart.yaml --- assets/prometheus/rules/kube-state-metrics.rules.yaml | 8 ++++---- 
manifests/prometheus/prometheus-k8s-rules.yaml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/assets/prometheus/rules/kube-state-metrics.rules.yaml b/assets/prometheus/rules/kube-state-metrics.rules.yaml index 2a5b9527..4c7041fe 100644 --- a/assets/prometheus/rules/kube-state-metrics.rules.yaml +++ b/assets/prometheus/rules/kube-state-metrics.rules.yaml @@ -8,7 +8,7 @@ groups: severity: warning annotations: description: Observed deployment generation does not match expected one for - deployment {{$labels.namespaces}}/{{$labels.deployment}} + deployment {{$labels.namespace}}/{{$labels.deployment}} summary: Deployment is outdated - alert: DeploymentReplicasNotUpdated expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) @@ -18,7 +18,7 @@ groups: labels: severity: warning annotations: - description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}} summary: Deployment replicas are outdated - alert: DaemonSetRolloutStuck expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled @@ -28,7 +28,7 @@ groups: severity: warning annotations: description: Only {{$value}}% of desired pods scheduled and ready for daemon - set {{$labels.namespaces}}/{{$labels.daemonset}} + set {{$labels.namespace}}/{{$labels.daemonset}} summary: DaemonSet is missing pods - alert: K8SDaemonSetsNotScheduled expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled @@ -54,6 +54,6 @@ groups: labels: severity: warning annotations: - description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}} + description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} times within the last hour summary: Pod is restarting frequently diff --git 
a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 7a8911d3..caea8d08 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -288,7 +288,7 @@ data: severity: warning annotations: description: Observed deployment generation does not match expected one for - deployment {{$labels.namespaces}}/{{$labels.deployment}} + deployment {{$labels.namespace}}/{{$labels.deployment}} summary: Deployment is outdated - alert: DeploymentReplicasNotUpdated expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) @@ -298,7 +298,7 @@ data: labels: severity: warning annotations: - description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}} summary: Deployment replicas are outdated - alert: DaemonSetRolloutStuck expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled @@ -308,7 +308,7 @@ data: severity: warning annotations: description: Only {{$value}}% of desired pods scheduled and ready for daemon - set {{$labels.namespaces}}/{{$labels.daemonset}} + set {{$labels.namespace}}/{{$labels.daemonset}} summary: DaemonSet is missing pods - alert: K8SDaemonSetsNotScheduled expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled @@ -334,7 +334,7 @@ data: labels: severity: warning annotations: - description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}} + description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} times within the last hour summary: Pod is restarting frequently kubelet.rules.yaml: |+ From d8692794a9a4233d21ae88a9114294a84ac5fe1c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Sun, 8 Apr 2018 14:53:30 +0200 Subject: [PATCH 227/638] kube-prometheus: 
Convert to jsonnet --- .gitignore | 1 + build.sh | 19 ++++ .../alertmanager-main-secret.libsonnet | 25 ++++++ ...lertmanager-main-service-account.libsonnet | 8 ++ .../alertmanager-main-service.libsonnet | 12 +++ .../alertmanager/alertmanager-main.libsonnet | 19 ++++ jsonnet/alertmanager/alertmanager.libsonnet | 6 ++ jsonnet/kube-prometheus.jsonnet | 62 +++++++++++++ ...ate-metrics-cluster-role-binding.libsonnet | 12 +++ .../kube-state-metrics-cluster-role.libsonnet | 75 ++++++++++++++++ .../kube-state-metrics-deployment.libsonnet | 86 +++++++++++++++++++ .../kube-state-metrics-role-binding.libsonnet | 13 +++ .../kube-state-metrics-role.libsonnet | 28 ++++++ ...be-state-metrics-service-account.libsonnet | 8 ++ .../kube-state-metrics-service.libsonnet | 15 ++++ .../kube-state-metrics.libsonnet | 9 ++ ...de-exporter-cluster-role-binding.libsonnet | 12 +++ .../node-exporter-cluster-role.libsonnet | 26 ++++++ .../node-exporter-daemonset.libsonnet | 58 +++++++++++++ .../node-exporter-service-account.libsonnet | 8 ++ .../node-exporter-service.libsonnet | 14 +++ jsonnet/node-exporter/node-exporter.libsonnet | 7 ++ ...us-operator-cluster-role-binding.libsonnet | 12 +++ ...prometheus-operator-cluster-role.libsonnet | 80 +++++++++++++++++ .../prometheus-operator-deployment.libsonnet | 30 +++++++ ...metheus-operator-service-account.libsonnet | 8 ++ .../prometheus-operator-service.libsonnet | 14 +++ .../prometheus-operator.libsonnet | 7 ++ ...metheus-k8s-cluster-role-binding.libsonnet | 12 +++ .../prometheus-k8s-cluster-role.libsonnet | 21 +++++ ...ometheus-k8s-role-binding-config.libsonnet | 5 ++ ...metheus-k8s-role-binding-default.libsonnet | 5 ++ ...eus-k8s-role-binding-kube-system.libsonnet | 5 ++ ...theus-k8s-role-binding-namespace.libsonnet | 5 ++ .../prometheus-k8s-role-config.libsonnet | 18 ++++ .../prometheus-k8s-role-default.libsonnet | 5 ++ .../prometheus-k8s-role-kube-system.libsonnet | 5 ++ .../prometheus-k8s-role-namespace.libsonnet | 5 ++ 
.../prometheus-k8s-service-account.libsonnet | 8 ++ ...k8s-service-monitor-alertmanager.libsonnet | 32 +++++++ ...us-k8s-service-monitor-apiserver.libsonnet | 40 +++++++++ ...heus-k8s-service-monitor-coredns.libsonnet | 35 ++++++++ ...-monitor-kube-controller-manager.libsonnet | 33 +++++++ ...s-service-monitor-kube-scheduler.libsonnet | 33 +++++++ ...rvice-monitor-kube-state-metrics.libsonnet | 48 +++++++++++ ...heus-k8s-service-monitor-kubelet.libsonnet | 49 +++++++++++ ...8s-service-monitor-node-exporter.libsonnet | 38 ++++++++ ...vice-monitor-prometheus-operator.libsonnet | 26 ++++++ ...s-k8s-service-monitor-prometheus.libsonnet | 32 +++++++ .../prometheus-k8s-service.libsonnet | 13 +++ jsonnet/prometheus/prometheus-k8s.libsonnet | 43 ++++++++++ ...rometheus-namespace-role-binding.libsonnet | 13 +++ .../prometheus-namespace-role.libsonnet | 21 +++++ jsonnet/prometheus/prometheus.libsonnet | 25 ++++++ 54 files changed, 1249 insertions(+) create mode 100644 .gitignore create mode 100755 build.sh create mode 100644 jsonnet/alertmanager/alertmanager-main-secret.libsonnet create mode 100644 jsonnet/alertmanager/alertmanager-main-service-account.libsonnet create mode 100644 jsonnet/alertmanager/alertmanager-main-service.libsonnet create mode 100644 jsonnet/alertmanager/alertmanager-main.libsonnet create mode 100644 jsonnet/alertmanager/alertmanager.libsonnet create mode 100644 jsonnet/kube-prometheus.jsonnet create mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-cluster-role-binding.libsonnet create mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-cluster-role.libsonnet create mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-deployment.libsonnet create mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet create mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-role.libsonnet create mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-service-account.libsonnet create mode 100644 
jsonnet/kube-state-metrics/kube-state-metrics-service.libsonnet create mode 100644 jsonnet/kube-state-metrics/kube-state-metrics.libsonnet create mode 100644 jsonnet/node-exporter/node-exporter-cluster-role-binding.libsonnet create mode 100644 jsonnet/node-exporter/node-exporter-cluster-role.libsonnet create mode 100644 jsonnet/node-exporter/node-exporter-daemonset.libsonnet create mode 100644 jsonnet/node-exporter/node-exporter-service-account.libsonnet create mode 100644 jsonnet/node-exporter/node-exporter-service.libsonnet create mode 100644 jsonnet/node-exporter/node-exporter.libsonnet create mode 100644 jsonnet/prometheus-operator/prometheus-operator-cluster-role-binding.libsonnet create mode 100644 jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet create mode 100644 jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet create mode 100644 jsonnet/prometheus-operator/prometheus-operator-service-account.libsonnet create mode 100644 jsonnet/prometheus-operator/prometheus-operator-service.libsonnet create mode 100644 jsonnet/prometheus-operator/prometheus-operator.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-cluster-role-binding.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-cluster-role.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-role-config.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-role-default.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-role-kube-system.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-role-namespace.libsonnet create mode 100644 
jsonnet/prometheus/prometheus-k8s-service-account.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-alertmanager.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-apiserver.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-coredns.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-kube-scheduler.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-kubelet.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-node-exporter.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus-operator.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-service.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s.libsonnet create mode 100644 jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet create mode 100644 jsonnet/prometheus/prometheus-namespace-role.libsonnet create mode 100644 jsonnet/prometheus/prometheus.libsonnet diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..3fec32c8 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +tmp/ diff --git a/build.sh b/build.sh new file mode 100755 index 00000000..a42a6bb7 --- /dev/null +++ b/build.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -e +set -x + +prefix="tmp/manifests" +json="tmp/manifests.json" + +rm -rf ${prefix} +mkdir -p $(dirname "${json}") +jsonnet -J /home/brancz/.jsonnet-bundler/src/git/git@github.com-ksonnet-ksonnet-lib/master jsonnet/kube-prometheus.jsonnet > ${json} + +files=$(jq -r 'keys[]' ${json}) + +for file in ${files}; do + dir=$(dirname "${file}") + path="${prefix}/${dir}" + mkdir 
-p ${path} + jq -r ".[\"${file}\"]" ${json} | yaml2json | json2yaml > "${prefix}/${file}" +done diff --git a/jsonnet/alertmanager/alertmanager-main-secret.libsonnet b/jsonnet/alertmanager/alertmanager-main-secret.libsonnet new file mode 100644 index 00000000..fca16566 --- /dev/null +++ b/jsonnet/alertmanager/alertmanager-main-secret.libsonnet @@ -0,0 +1,25 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; +local secret = k.core.v1.secret; + +local plainConfig = "global: + resolve_timeout: 5m +route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' +receivers: +- name: 'null'"; + +local config = std.base64(plainConfig); + +{ + new(namespace):: + secret.new("alertmanager-main", {"alertmanager.yaml": config}) + + secret.mixin.metadata.withNamespace(namespace) +} diff --git a/jsonnet/alertmanager/alertmanager-main-service-account.libsonnet b/jsonnet/alertmanager/alertmanager-main-service-account.libsonnet new file mode 100644 index 00000000..89ca2f80 --- /dev/null +++ b/jsonnet/alertmanager/alertmanager-main-service-account.libsonnet @@ -0,0 +1,8 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; +local serviceAccount = k.core.v1.serviceAccount; + +{ + new(namespace):: + serviceAccount.new("alertmanager-main") + + serviceAccount.mixin.metadata.withNamespace(namespace) +} diff --git a/jsonnet/alertmanager/alertmanager-main-service.libsonnet b/jsonnet/alertmanager/alertmanager-main-service.libsonnet new file mode 100644 index 00000000..e89f009f --- /dev/null +++ b/jsonnet/alertmanager/alertmanager-main-service.libsonnet @@ -0,0 +1,12 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +local alertmanagerPort = servicePort.newNamed("web", 9093, "web"); + +{ + new(namespace):: + service.new("alertmanager-main", {app: "alertmanager", alertmanager: "main"}, 
alertmanagerPort) + + service.mixin.metadata.withNamespace(namespace) + + service.mixin.metadata.withLabels({alertmanager: "main"}) +} diff --git a/jsonnet/alertmanager/alertmanager-main.libsonnet b/jsonnet/alertmanager/alertmanager-main.libsonnet new file mode 100644 index 00000000..63e06a16 --- /dev/null +++ b/jsonnet/alertmanager/alertmanager-main.libsonnet @@ -0,0 +1,19 @@ +{ + new(namespace):: + { + apiVersion: "monitoring.coreos.com/v1", + kind: "Alertmanager", + metadata: { + name: "main", + namespace: namespace, + labels: { + alertmanager: "main", + }, + }, + spec: { + replicas: 3, + version: "v0.14.0", + serviceAccountName: "alertmanager-main", + }, + } +} diff --git a/jsonnet/alertmanager/alertmanager.libsonnet b/jsonnet/alertmanager/alertmanager.libsonnet new file mode 100644 index 00000000..ef837aba --- /dev/null +++ b/jsonnet/alertmanager/alertmanager.libsonnet @@ -0,0 +1,6 @@ +{ + config:: import "alertmanager-main-secret.libsonnet", + serviceAccount:: import "alertmanager-main-service-account.libsonnet", + service:: import "alertmanager-main-service.libsonnet", + alertmanager:: import "alertmanager-main.libsonnet", +} diff --git a/jsonnet/kube-prometheus.jsonnet b/jsonnet/kube-prometheus.jsonnet new file mode 100644 index 00000000..3a0ef2cf --- /dev/null +++ b/jsonnet/kube-prometheus.jsonnet @@ -0,0 +1,62 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; + +local alertmanager = import "alertmanager/alertmanager.libsonnet"; +local ksm = import "kube-state-metrics/kube-state-metrics.libsonnet"; +local nodeExporter = import "node-exporter/node-exporter.libsonnet"; +local po = import "prometheus-operator/prometheus-operator.libsonnet"; +local prometheus = import "prometheus/prometheus.libsonnet"; + +local namespace = "monitoring"; + +local objects = { + "alertmanager-main/alertmanager-main-secret.yaml": alertmanager.config.new(namespace), + "alertmanager-main/alertmanager-main-service-account.yaml": alertmanager.serviceAccount.new(namespace), + 
"alertmanager-main/alertmanager-main-service.yaml": alertmanager.service.new(namespace), + "alertmanager-main/alertmanager-main.yaml": alertmanager.alertmanager.new(namespace), + + "kube-state-metrics/kube-state-metrics-cluster-role-binding": ksm.clusterRoleBinding.new(namespace), + "kube-state-metrics/kube-state-metrics-cluster-role.yaml": ksm.clusterRole.new(), + "kube-state-metrics/kube-state-metrics-deployment.yaml": ksm.deployment.new(namespace), + "kube-state-metrics/kube-state-metrics-role-binding.yaml": ksm.roleBinding.new(namespace), + "kube-state-metrics/kube-state-metrics-role.yaml": ksm.role.new(namespace), + "kube-state-metrics/kube-state-metrics-service-account.yaml": ksm.serviceAccount.new(namespace), + "kube-state-metrics/kube-state-metrics-service.yaml": ksm.service.new(namespace), + + "node-exporter/node-exporter-cluster-role-binding.yaml": nodeExporter.clusterRoleBinding.new(namespace), + "node-exporter/node-exporter-cluster-role.yaml": nodeExporter.clusterRole.new(), + "node-exporter/node-exporter-daemonset.yaml": nodeExporter.daemonset.new(namespace), + "node-exporter/node-exporter-service-account.yaml": nodeExporter.serviceAccount.new(namespace), + "node-exporter/node-exporter-service.yaml": nodeExporter.service.new(namespace), + + "prometheus-operator/prometheus-operator-cluster-role-binding.yaml": po.clusterRoleBinding.new(namespace), + "prometheus-operator/prometheus-operator-cluster-role.yaml": po.clusterRole.new(), + "prometheus-operator/prometheus-operator-deployment.yaml": po.deployment.new(namespace), + "prometheus-operator/prometheus-operator-service.yaml": po.service.new(namespace), + "prometheus-operator/prometheus-operator-service-account.yaml": po.serviceAccount.new(namespace), + + "prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml": prometheus.clusterRoleBinding.new(namespace), + "prometheus-k8s/prometheus-k8s-cluster-role.yaml": prometheus.clusterRole.new(), + "prometheus-k8s/prometheus-k8s-service-account.yaml": 
prometheus.serviceAccount.new(namespace), + "prometheus-k8s/prometheus-k8s-service.yaml": prometheus.service.new(namespace), + "prometheus-k8s/prometheus-k8s.yaml": prometheus.prometheus.new(namespace), + "prometheus-k8s/prometheus-k8s-role-binding-config.yaml": prometheus.roleBindingConfig.new(namespace), + "prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml": prometheus.roleBindingNamespace.new(namespace), + "prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml": prometheus.roleBindingKubeSystem.new(namespace), + "prometheus-k8s/prometheus-k8s-role-binding-default.yaml": prometheus.roleBindingDefault.new(namespace), + "prometheus-k8s/prometheus-k8s-role-config.yaml": prometheus.roleConfig.new(namespace), + "prometheus-k8s/prometheus-k8s-role-namespace.yaml": prometheus.roleNamespace.new(namespace), + "prometheus-k8s/prometheus-k8s-role-kube-system.yaml": prometheus.roleKubeSystem.new(), + "prometheus-k8s/prometheus-k8s-role-default.yaml": prometheus.roleDefault.new(), + "prometheus-k8s/prometheus-k8s-service-monitor-alertmanager.yaml": prometheus.serviceMonitorAlertmanager.new(namespace), + "prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml": prometheus.serviceMonitorApiserver.new(namespace), + "prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml": prometheus.serviceMonitorCoreDNS.new(namespace), + "prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml": prometheus.serviceMonitorControllerManager.new(namespace), + "prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml": prometheus.serviceMonitorScheduler.new(namespace), + "prometheus-k8s/prometheus-k8s-service-monitor-kube-state-metrics.yaml": prometheus.serviceMonitorKubeStateMetrics.new(namespace), + "prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml": prometheus.serviceMonitorKubelet.new(namespace), + "prometheus-k8s/prometheus-k8s-service-monitor-node-exporter.yaml": prometheus.serviceMonitorNodeExporter.new(namespace), + 
// ClusterRole for kube-state-metrics.
//
// Grants list/watch on the object kinds enumerated below, plus create-only
// access to TokenReview and SubjectAccessReview (presumably for the
// kube-rbac-proxy sidecars -- confirm against the deployment).
local k = import "ksonnet.beta.3/k.libsonnet";
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;

// Builds one policy rule scoped to a single API group.
local ruleFor(apiGroup, resources, verbs) =
  policyRule.new() +
  policyRule.withApiGroups([apiGroup]) +
  policyRule.withResources(resources) +
  policyRule.withVerbs(verbs);

local readVerbs = ["list", "watch"];
local reviewVerbs = ["create"];

local rules = [
  // Core ("") API group.
  ruleFor("", [
    "configmaps",
    "secrets",
    "nodes",
    "pods",
    "services",
    "resourcequotas",
    "replicationcontrollers",
    "limitranges",
    "persistentvolumeclaims",
    "persistentvolumes",
    "namespaces",
    "endpoints",
  ], readVerbs),
  ruleFor("extensions", [
    "daemonsets",
    "deployments",
    "replicasets",
  ], readVerbs),
  ruleFor("apps", [
    "statefulsets",
  ], readVerbs),
  ruleFor("batch", [
    "cronjobs",
    "jobs",
  ], readVerbs),
  ruleFor("autoscaling", [
    "horizontalpodautoscalers",
  ], readVerbs),
  ruleFor("authentication.k8s.io", [
    "tokenreviews",
  ], reviewVerbs),
  ruleFor("authorization.k8s.io", [
    "subjectaccessreviews",
  ], reviewVerbs),
];

{
  // Returns the cluster-scoped role; takes no namespace argument.
  new()::
    clusterRole.new() +
    clusterRole.mixin.metadata.withName("kube-state-metrics") +
    clusterRole.withRules(rules),
}
+local volume = k.apps.v1beta2.deployment.mixin.spec.template.spec.volumesType; +local containerPort = container.portsType; +local containerVolumeMount = container.volumeMountsType; +local podSelector = deployment.mixin.spec.template.spec.selectorType; + +local kubeStateMetricsVersion = "v1.3.0"; +local kubeRbacProxyVersion = "v0.3.0"; +local addonResizerVersion = "1.0"; +local podLabels = {"app": "kube-state-metrics"}; + +local proxyClusterMetrics = + container.new("kube-rbac-proxy-main", "quay.io/coreos/kube-rbac-proxy:" + kubeRbacProxyVersion) + + container.withArgs([ + "--secure-listen-address=:8443", + "--upstream=http://127.0.0.1:8081/", + ]) + + container.withPorts(containerPort.newNamed("https-main", 8443)) + + container.mixin.resources.withRequests({cpu: "10m", memory: "20Mi"}) + + container.mixin.resources.withLimits({cpu: "20m", memory: "40Mi"}); + +local proxySelfMetrics = + container.new("kube-rbac-proxy-self", "quay.io/coreos/kube-rbac-proxy:" + kubeRbacProxyVersion) + + container.withArgs([ + "--secure-listen-address=:9443", + "--upstream=http://127.0.0.1:8082/", + ]) + + container.withPorts(containerPort.newNamed("https-self", 9443)) + + container.mixin.resources.withRequests({cpu: "10m", memory: "20Mi"}) + + container.mixin.resources.withLimits({cpu: "20m", memory: "40Mi"}); + +local kubeStateMetrics = + container.new("kube-state-metrics", "quay.io/coreos/kube-state-metrics:" + kubeStateMetricsVersion) + + container.withArgs([ + "--host=127.0.0.1", + "--port=8081", + "--telemetry-host=127.0.0.1", + "--telemetry-port=8082", + ]) + + container.mixin.resources.withRequests({cpu: "102m", memory: "180Mi"}) + + container.mixin.resources.withLimits({cpu: "102m", memory: "180Mi"}); + +local addonResizer = + container.new("addon-resizer", "quay.io/coreos/addon-resizer:" + addonResizerVersion) + + container.withCommand([ + "/pod_nanny", + "--container=kube-state-metrics", + "--cpu=100m", + "--extra-cpu=2m", + "--memory=150Mi", + "--extra-memory=30Mi", + 
// RoleBinding that grants the kube-state-metrics ServiceAccount the
// namespaced "kube-state-metrics" Role.
local k = import "ksonnet.beta.3/k.libsonnet";
local roleBinding = k.rbac.v1.roleBinding;

{
  // namespace: namespace holding the binding, the Role and the ServiceAccount.
  new(namespace)::
    roleBinding.new() +
    roleBinding.mixin.metadata.withName("kube-state-metrics") +
    roleBinding.mixin.metadata.withNamespace(namespace) +
    roleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") +
    // Must equal the metadata.name of the Role produced by
    // kube-state-metrics-role.libsonnet; a roleRef naming a Role that does not
    // exist grants nothing.
    roleBinding.mixin.roleRef.withName("kube-state-metrics") +
    roleBinding.mixin.roleRef.mixinInstance({kind: "Role"}) +
    // The subject namespace is spelled out, as in the ClusterRoleBinding for
    // the same ServiceAccount.
    roleBinding.withSubjects([{kind: "ServiceAccount", name: "kube-state-metrics", namespace: namespace}]),
}
// Namespaced Role for kube-state-metrics: read pods, and get/update the
// "kube-state-metrics" deployment only.
local k = import "ksonnet.beta.3/k.libsonnet";
local role = k.rbac.v1.role;
local policyRule = role.rulesType;

local rules = [
  // Core API group: read-only access to pods.
  policyRule.new() +
  policyRule.withApiGroups([""]) +
  policyRule.withResources(["pods"]) +
  policyRule.withVerbs(["get"]),

  // extensions API group: restricted by resourceNames to a single deployment.
  policyRule.new() +
  policyRule.withApiGroups(["extensions"]) +
  policyRule.withResources(["deployments"]) +
  policyRule.withVerbs(["get", "update"]) +
  policyRule.withResourceNames(["kube-state-metrics"]),
];

{
  // namespace: namespace the Role is created in.
  new(namespace)::
    role.new() +
    role.mixin.metadata.withName("kube-state-metrics") +
    role.mixin.metadata.withNamespace(namespace) +
    role.withRules(rules),
}
// Entry point for the kube-state-metrics component. Each hidden field exposes
// one sibling library; callers build a manifest with `<field>.new(...)`.
{
  // RBAC.
  clusterRole:: import 'kube-state-metrics-cluster-role.libsonnet',
  clusterRoleBinding:: import 'kube-state-metrics-cluster-role-binding.libsonnet',
  role:: import 'kube-state-metrics-role.libsonnet',
  roleBinding:: import 'kube-state-metrics-role-binding.libsonnet',
  serviceAccount:: import 'kube-state-metrics-service-account.libsonnet',

  // Workload and its Service.
  deployment:: import 'kube-state-metrics-deployment.libsonnet',
  service:: import 'kube-state-metrics-service.libsonnet',
}
// DaemonSet running node-exporter behind a kube-rbac-proxy sidecar.
//
// node-exporter listens on 127.0.0.1:9101; the proxy listens on :9100 and
// forwards to that loopback address.
local k = import "ksonnet.beta.3/k.libsonnet";

local daemonset = k.apps.v1beta2.daemonSet;
local container = daemonset.mixin.spec.template.spec.containersType;
local volume = daemonset.mixin.spec.template.spec.volumesType;
local containerPort = container.portsType;
local containerVolumeMount = container.volumeMountsType;
// The former `podSelector` local was never referenced and has been removed.

local nodeExporterVersion = "v0.15.2";
local kubeRbacProxyVersion = "v0.3.0";
local podLabels = {"app": "node-exporter"};

// Host /proc, mounted at /host/proc and passed via --path.procfs.
local procVolumeName = "proc";
local procVolume = volume.fromHostPath(procVolumeName, "/proc");
local procVolumeMount = containerVolumeMount.new(procVolumeName, "/host/proc");

// Host /sys, mounted at /host/sys and passed via --path.sysfs.
local sysVolumeName = "sys";
local sysVolume = volume.fromHostPath(sysVolumeName, "/sys");
local sysVolumeMount = containerVolumeMount.new(sysVolumeName, "/host/sys");

local nodeExporter =
  container.new("node-exporter", "quay.io/prometheus/node-exporter:" + nodeExporterVersion) +
  container.withArgs([
    "--web.listen-address=127.0.0.1:9101",
    "--path.procfs=/host/proc",
    "--path.sysfs=/host/sys",
  ]) +
  container.withVolumeMounts([procVolumeMount, sysVolumeMount]) +
  container.mixin.resources.withRequests({cpu: "102m", memory: "180Mi"}) +
  container.mixin.resources.withLimits({cpu: "102m", memory: "180Mi"});

local proxy =
  container.new("kube-rbac-proxy", "quay.io/coreos/kube-rbac-proxy:" + kubeRbacProxyVersion) +
  container.withArgs([
    "--secure-listen-address=:9100",
    "--upstream=http://127.0.0.1:9101/",
  ]) +
  container.withPorts(containerPort.newNamed("https", 9100)) +
  container.mixin.resources.withRequests({cpu: "10m", memory: "20Mi"}) +
  container.mixin.resources.withLimits({cpu: "20m", memory: "40Mi"});

local c = [nodeExporter, proxy];

{
  // namespace: namespace the DaemonSet is created in.
  new(namespace)::
    daemonset.new() +
    daemonset.mixin.metadata.withName("node-exporter") +
    daemonset.mixin.metadata.withNamespace(namespace) +
    daemonset.mixin.metadata.withLabels(podLabels) +
    daemonset.mixin.spec.selector.withMatchLabels(podLabels) +
    daemonset.mixin.spec.template.metadata.withLabels(podLabels) +
    daemonset.mixin.spec.template.spec.withContainers(c) +
    daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume]) +
    daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
    daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
    daemonset.mixin.spec.template.spec.withServiceAccountName("node-exporter"),
}
// Service exposing the node-exporter "https" port (9100).
local k = import "ksonnet.beta.3/k.libsonnet";
local nodeExporterDaemonset = import "node-exporter-daemonset.libsonnet";

local service = k.core.v1.service;
local servicePort = service.mixin.spec.portsType;

local serviceLabels = {"k8s-app": "node-exporter"};
local httpsPort = servicePort.newNamed("https", 9100, "https");

{
  // namespace: namespace the Service is created in.
  new(namespace)::
    // Reuse the DaemonSet's own selector labels as the Service selector so
    // the two cannot drift apart.
    local podSelector = nodeExporterDaemonset.new(namespace).spec.selector.matchLabels;

    service.new("node-exporter", podSelector, httpsPort) +
    service.mixin.metadata.withNamespace(namespace) +
    service.mixin.metadata.withLabels(serviceLabels),
}
// ClusterRoleBinding tying the "prometheus-operator" ClusterRole to the
// ServiceAccount of the same name.
local k = import "ksonnet.beta.3/k.libsonnet";
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;

local name = "prometheus-operator";

{
  // namespace: namespace of the ServiceAccount subject; the binding itself
  // gets no namespace.
  new(namespace)::
    local subject = {kind: "ServiceAccount", name: name, namespace: namespace};

    clusterRoleBinding.new() +
    clusterRoleBinding.mixin.metadata.withName(name) +
    clusterRoleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") +
    clusterRoleBinding.mixin.roleRef.withName(name) +
    clusterRoleBinding.mixin.roleRef.mixinInstance({kind: "ClusterRole"}) +
    clusterRoleBinding.withSubjects([subject]),
}
// Deployment for the Prometheus Operator. The image tag is "v" followed by
// the contents of the VERSION file, with newline characters removed.
local k = import "ksonnet.beta.3/k.libsonnet";
local rawVersion = importstr "../../../../VERSION";

// Drops every "\n" character from `str`.
local removeLineBreaks(str) =
  std.join("", [ch for ch in std.stringChars(str) if ch != "\n"]);
local version = removeLineBreaks(rawVersion);

local deployment = k.apps.v1beta2.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local containerPort = container.portsType;

local targetPort = 8080;
local podLabels = {"k8s-app": "prometheus-operator"};

local operatorImage = "quay.io/coreos/prometheus-operator:v" + version;
local operatorArgs = [
  "--kubelet-service=kube-system/kubelet",
  "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1",
];

local operatorContainer =
  container.new("prometheus-operator", operatorImage) +
  container.withPorts(containerPort.newNamed("http", targetPort)) +
  container.withArgs(operatorArgs) +
  container.mixin.resources.withRequests({cpu: "100m", memory: "50Mi"}) +
  container.mixin.resources.withLimits({cpu: "200m", memory: "100Mi"});

{
  // namespace: namespace the Deployment is created in.
  new(namespace)::
    local podSpec = deployment.mixin.spec.template.spec;

    deployment.new("prometheus-operator", 1, operatorContainer, podLabels) +
    deployment.mixin.metadata.withNamespace(namespace) +
    deployment.mixin.metadata.withLabels(podLabels) +
    deployment.mixin.spec.selector.withMatchLabels(podLabels) +
    podSpec.securityContext.withRunAsNonRoot(true) +
    podSpec.securityContext.withRunAsUser(65534) +
    podSpec.withServiceAccountName("prometheus-operator"),
}
// ClusterRole for the prometheus-k8s ServiceAccount: GET on nodes/metrics and
// on the non-resource URL /metrics.
local k = import "ksonnet.beta.3/k.libsonnet";
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;

local getOnly = ["get"];

local rules = [
  // Resource rule: the nodes/metrics subresource in the core API group.
  policyRule.new() +
  policyRule.withApiGroups([""]) +
  policyRule.withResources(["nodes/metrics"]) +
  policyRule.withVerbs(getOnly),

  // Non-resource rule: the /metrics URL path.
  policyRule.new() +
  policyRule.withNonResourceUrls("/metrics") +
  policyRule.withVerbs(getOnly),
];

{
  // Returns the cluster-scoped role; takes no namespace argument.
  new()::
    clusterRole.new() +
    clusterRole.mixin.metadata.withName("prometheus-k8s") +
    clusterRole.withRules(rules),
}
// Namespaced Role "prometheus-k8s-config": GET on configmaps only.
local k = import "ksonnet.beta.3/k.libsonnet";
local role = k.rbac.v1.role;
local policyRule = role.rulesType;

{
  // namespace: namespace the Role is created in.
  new(namespace)::
    local configmapRule =
      policyRule.new() +
      policyRule.withApiGroups([""]) +
      policyRule.withResources(["configmaps"]) +
      policyRule.withVerbs(["get"]);

    role.new() +
    role.mixin.metadata.withName("prometheus-k8s-config") +
    role.mixin.metadata.withNamespace(namespace) +
    role.withRules(configmapRule),
}
// ServiceMonitor scraping the "web" port of Services labelled
// alertmanager=main every 30s.
{
  // namespace: namespace of the ServiceMonitor, also used as the namespace in
  // which target Services are looked up.
  new(namespace)::
    {
      "apiVersion": "monitoring.coreos.com/v1",
      "kind": "ServiceMonitor",
      "metadata": {
        "name": "alertmanager",
        "namespace": namespace,
        "labels": {
          "k8s-app": "alertmanager"
        }
      },
      "spec": {
        "selector": {
          "matchLabels": {
            "alertmanager": "main"
          }
        },
        "namespaceSelector": {
          // Previously hard-coded to "monitoring", which stops matching as
          // soon as the stack is rendered for any other namespace.
          // NOTE(review): assumes Alertmanager is deployed into the same
          // `namespace` as this monitor -- confirm against the caller.
          "matchNames": [
            namespace
          ]
        },
        "endpoints": [
          {
            "port": "web",
            "interval": "30s"
          }
        ]
      }
    }
}
// ServiceMonitor for the Kubernetes API server: selects Services labelled
// component=apiserver, provider=kubernetes in the "default" namespace and
// scrapes their "https" port every 30s.
local serviceAccountDir = "/var/run/secrets/kubernetes.io/serviceaccount";

local apiserverEndpoint = {
  port: "https",
  interval: "30s",
  scheme: "https",
  tlsConfig: {
    caFile: serviceAccountDir + "/ca.crt",
    serverName: "kubernetes",
  },
  bearerTokenFile: serviceAccountDir + "/token",
};

{
  // namespace: namespace the ServiceMonitor object itself is created in.
  new(namespace):: {
    apiVersion: "monitoring.coreos.com/v1",
    kind: "ServiceMonitor",
    metadata: {
      name: "kube-apiserver",
      namespace: namespace,
      labels: {"k8s-app": "apiserver"},
    },
    spec: {
      jobLabel: "component",
      selector: {
        matchLabels: {
          component: "apiserver",
          provider: "kubernetes",
        },
      },
      namespaceSelector: {
        matchNames: ["default"],
      },
      endpoints: [apiserverEndpoint],
    },
  },
}
// ServiceMonitor for kube-state-metrics. Scrapes both HTTPS ports of Services
// labelled k8s-app=kube-state-metrics:
//   - "https-main", with honorLabels enabled
//   - "https-self"
// Both endpoints send the pod's ServiceAccount token and skip TLS
// verification.
{
  // namespace: namespace of the ServiceMonitor, also used as the namespace in
  // which the kube-state-metrics Service is looked up.
  new(namespace)::
    {
      "apiVersion": "monitoring.coreos.com/v1",
      "kind": "ServiceMonitor",
      "metadata": {
        "name": "kube-state-metrics",
        "namespace": namespace,
        "labels": {
          "k8s-app": "kube-state-metrics"
        }
      },
      "spec": {
        "jobLabel": "k8s-app",
        "selector": {
          "matchLabels": {
            "k8s-app": "kube-state-metrics"
          }
        },
        "namespaceSelector": {
          // Previously hard-coded to "monitoring". The kube-state-metrics
          // Service library places the Service in the `namespace` it is
          // given, so the selector must follow that same value or no targets
          // are found outside "monitoring".
          "matchNames": [
            namespace
          ]
        },
        "endpoints": [
          {
            "port": "https-main",
            "scheme": "https",
            "interval": "30s",
            "honorLabels": true,
            "bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token",
            "tlsConfig": {
              "insecureSkipVerify": true
            }
          },
          {
            "port": "https-self",
            "scheme": "https",
            "interval": "30s",
            "bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token",
            "tlsConfig": {
              "insecureSkipVerify": true
            }
          }
        ]
      }
    }
}
00000000..d1ff25e7 --- /dev/null +++ b/jsonnet/prometheus/prometheus-k8s-service-monitor-node-exporter.libsonnet @@ -0,0 +1,38 @@ +{ + new(namespace):: + { + "apiVersion": "monitoring.coreos.com/v1", + "kind": "ServiceMonitor", + "metadata": { + "name": "node-exporter", + "namespace": namespace, + "labels": { + "k8s-app": "node-exporter" + } + }, + "spec": { + "jobLabel": "k8s-app", + "selector": { + "matchLabels": { + "k8s-app": "node-exporter" + } + }, + "namespaceSelector": { + "matchNames": [ + "monitoring" + ] + }, + "endpoints": [ + { + "port": "https", + "scheme": "https", + "interval": "30s", + "bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token", + "tlsConfig": { + "insecureSkipVerify": true + } + } + ] + } + } +} diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus-operator.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus-operator.libsonnet new file mode 100644 index 00000000..07613f8c --- /dev/null +++ b/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus-operator.libsonnet @@ -0,0 +1,26 @@ +{ + new(namespace):: + { + "apiVersion": "monitoring.coreos.com/v1", + "kind": "ServiceMonitor", + "metadata": { + "name": "prometheus-operator", + "namespace": namespace, + "labels": { + "k8s-app": "prometheus-operator" + } + }, + "spec": { + "endpoints": [ + { + "port": "http" + } + ], + "selector": { + "matchLabels": { + "k8s-app": "prometheus-operator" + } + } + } + } +} diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus.libsonnet new file mode 100644 index 00000000..0f4ef084 --- /dev/null +++ b/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus.libsonnet @@ -0,0 +1,32 @@ +{ + new(namespace):: + { + "apiVersion": "monitoring.coreos.com/v1", + "kind": "ServiceMonitor", + "metadata": { + "name": "prometheus", + "namespace": namespace, + "labels": { + "k8s-app": "prometheus" + } + }, + 
"spec": { + "selector": { + "matchLabels": { + "prometheus": "k8s" + } + }, + "namespaceSelector": { + "matchNames": [ + "monitoring" + ] + }, + "endpoints": [ + { + "port": "web", + "interval": "30s" + } + ] + } + } +} diff --git a/jsonnet/prometheus/prometheus-k8s-service.libsonnet b/jsonnet/prometheus/prometheus-k8s-service.libsonnet new file mode 100644 index 00000000..96781d69 --- /dev/null +++ b/jsonnet/prometheus/prometheus-k8s-service.libsonnet @@ -0,0 +1,13 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +local prometheusPort = servicePort.newNamed("web", 9090, "web"); + + +{ + new(namespace):: + service.new("prometheus-k8s", {app: "prometheus", prometheus: "k8s"}, prometheusPort) + + service.mixin.metadata.withNamespace(namespace) + + service.mixin.metadata.withLabels({prometheus: "k8s"}) +} diff --git a/jsonnet/prometheus/prometheus-k8s.libsonnet b/jsonnet/prometheus/prometheus-k8s.libsonnet new file mode 100644 index 00000000..853f62b1 --- /dev/null +++ b/jsonnet/prometheus/prometheus-k8s.libsonnet @@ -0,0 +1,43 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; + +local container = k.core.v1.pod.mixin.spec.containersType; +local resourceRequirements = container.mixin.resourcesType; +local selector = k.apps.v1beta2.deployment.mixin.spec.selectorType; + +local resources = resourceRequirements.new() + + resourceRequirements.withRequests({memory: "400Mi"}); + +{ + new(namespace):: + { + apiVersion: "monitoring.coreos.com/v1", + kind: "Prometheus", + metadata: { + name: "k8s", + namespace: namespace, + labels: { + prometheus: "k8s", + }, + }, + spec: { + replicas: 2, + version: "v2.2.1", + serviceAccountName: "prometheus-k8s", + serviceMonitorSelector: selector.withMatchExpressions({key: "k8s-app", operator: "Exists"}), + ruleSelector: selector.withMatchLabels({ + role: "alert-rules", + prometheus: "k8s", + }), + resources: resources, + alerting: { + 
alertmanagers: [ + { + namespace: "monitoring", + name: "alertmanager-main", + port: "web", + }, + ], + }, + }, + } +} diff --git a/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet b/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet new file mode 100644 index 00000000..8b255fa0 --- /dev/null +++ b/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet @@ -0,0 +1,13 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; +local roleBinding = k.rbac.v1.roleBinding; + +{ + new(serviceAccountNamespace, namespace, name):: + roleBinding.new() + + roleBinding.mixin.metadata.withName(name) + + roleBinding.mixin.metadata.withNamespace(namespace) + + roleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + + roleBinding.mixin.roleRef.withName(name) + + roleBinding.mixin.roleRef.mixinInstance({kind: "Role"}) + + roleBinding.withSubjects([{kind: "ServiceAccount", name: name, namespace: serviceAccountNamespace}]) +} diff --git a/jsonnet/prometheus/prometheus-namespace-role.libsonnet b/jsonnet/prometheus/prometheus-namespace-role.libsonnet new file mode 100644 index 00000000..5afdcff4 --- /dev/null +++ b/jsonnet/prometheus/prometheus-namespace-role.libsonnet @@ -0,0 +1,21 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; +local role = k.rbac.v1.role; +local policyRule = role.rulesType; + +{ + new(namespace):: + local coreRule = policyRule.new() + + policyRule.withApiGroups([""]) + + policyRule.withResources([ + "nodes", + "services", + "endpoints", + "pods", + ]) + + policyRule.withVerbs(["get", "list", "watch"]); + + role.new() + + role.mixin.metadata.withName("prometheus-k8s") + + role.mixin.metadata.withNamespace(namespace) + + role.withRules(coreRule) +} diff --git a/jsonnet/prometheus/prometheus.libsonnet b/jsonnet/prometheus/prometheus.libsonnet new file mode 100644 index 00000000..edc75c08 --- /dev/null +++ b/jsonnet/prometheus/prometheus.libsonnet @@ -0,0 +1,25 @@ +{ + clusterRoleBinding:: import 
"prometheus-k8s-cluster-role-binding.libsonnet", + clusterRole:: import "prometheus-k8s-cluster-role.libsonnet", + roleBindingConfig:: import "prometheus-k8s-role-binding-config.libsonnet", + roleBindingNamespace:: import "prometheus-k8s-role-binding-namespace.libsonnet", + roleBindingKubeSystem:: import "prometheus-k8s-role-binding-kube-system.libsonnet", + roleBindingDefault:: import "prometheus-k8s-role-binding-default.libsonnet", + roleConfig:: import "prometheus-k8s-role-config.libsonnet", + roleNamespace:: import "prometheus-k8s-role-namespace.libsonnet", + roleKubeSystem:: import "prometheus-k8s-role-kube-system.libsonnet", + roleDefault:: import "prometheus-k8s-role-default.libsonnet", + serviceAccount:: import "prometheus-k8s-service-account.libsonnet", + serviceMonitorAlertmanager:: import "prometheus-k8s-service-monitor-alertmanager.libsonnet", + serviceMonitorApiserver:: import "prometheus-k8s-service-monitor-apiserver.libsonnet", + serviceMonitorCoreDNS:: import "prometheus-k8s-service-monitor-coredns.libsonnet", + serviceMonitorControllerManager:: import "prometheus-k8s-service-monitor-kube-controller-manager.libsonnet", + serviceMonitorScheduler:: import "prometheus-k8s-service-monitor-kube-scheduler.libsonnet", + serviceMonitorKubeStateMetrics:: import "prometheus-k8s-service-monitor-kube-state-metrics.libsonnet", + serviceMonitorKubelet:: import "prometheus-k8s-service-monitor-kubelet.libsonnet", + serviceMonitorNodeExporter:: import "prometheus-k8s-service-monitor-node-exporter.libsonnet", + serviceMonitorPrometheusOperator:: import "prometheus-k8s-service-monitor-prometheus-operator.libsonnet", + serviceMonitorPrometheus:: import "prometheus-k8s-service-monitor-prometheus.libsonnet", + service:: import "prometheus-k8s-service.libsonnet", + prometheus:: import "prometheus-k8s.libsonnet", +} From e7e23cd81f3497656bc8822e7b126e12fcef2525 Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Thu, 22 Mar 2018 13:03:37 +0100 Subject: [PATCH 228/638] 
contrib: add Thanos example manifests --- manifests/thanos/prometheus-self.yaml | 73 ++++++++++++++++++++++++++ manifests/thanos/query.yaml | 51 ++++++++++++++++++ manifests/thanos/thanos-peers-svc.yaml | 14 +++++ 3 files changed, 138 insertions(+) create mode 100644 manifests/thanos/prometheus-self.yaml create mode 100644 manifests/thanos/query.yaml create mode 100644 manifests/thanos/thanos-peers-svc.yaml diff --git a/manifests/thanos/prometheus-self.yaml b/manifests/thanos/prometheus-self.yaml new file mode 100644 index 00000000..e778905a --- /dev/null +++ b/manifests/thanos/prometheus-self.yaml @@ -0,0 +1,73 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: self + labels: + prometheus: self +spec: + podMetadata: + labels: + thanos-peer: 'true' + replicas: 2 + version: v2.2.1 + serviceAccountName: prometheus-k8s + serviceMonitorSelector: + matchLabels: + app: prometheus + ruleSelector: + matchLabels: + role: prometheus-rulefiles + prometheus: k8s + resources: + requests: + # 2Gi is default, but won't schedule if you don't have a node with >2Gi + # memory. Modify based on your target and time-series count for + # production use. This value is mainly meant for demonstration/testing + # purposes. 
+ memory: 400Mi + containers: + - name: thanos + image: improbable/thanos:latest + args: + - "sidecar" + - "--log.level=debug" + - "--cluster.peers=thanos-peers.default.svc:10900" + ports: + - name: http + containerPort: 10902 + - name: grpc + containerPort: 10901 + - name: cluster + containerPort: 10900 +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: prometheus + labels: + app: prometheus +spec: + selector: + matchLabels: + app: prometheus + endpoints: + - port: web + interval: 30s +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + prometheus: self + name: prometheus-self +spec: + type: NodePort + ports: + - name: web + nodePort: 30900 + port: 9090 + protocol: TCP + targetPort: web + selector: + prometheus: self diff --git a/manifests/thanos/query.yaml b/manifests/thanos/query.yaml new file mode 100644 index 00000000..eb1d99ba --- /dev/null +++ b/manifests/thanos/query.yaml @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: thanos-query + labels: + app: thanos-query + thanos-peer: "true" +spec: + replicas: 2 + selector: + matchLabels: + app: thanos-query + thanos-peer: "true" + template: + metadata: + labels: + app: thanos-query + thanos-peer: "true" + spec: + containers: + - name: thanos-query + image: improbable/thanos:latest + args: + - "query" + - "--log.level=debug" + - "--query.replica-label=prometheus_replica" + - "--cluster.peers=thanos-peers.default.svc:10900" + ports: + - name: http + containerPort: 10902 + - name: grpc + containerPort: 10901 + - name: cluster + containerPort: 10900 +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: thanos-query + name: thanos-query +spec: + type: NodePort + selector: + app: thanos-query + ports: + - port: 9090 + protocol: TCP + targetPort: http + name: http-query + nodePort: 31111 \ No newline at end of file diff --git a/manifests/thanos/thanos-peers-svc.yaml b/manifests/thanos/thanos-peers-svc.yaml new file mode 100644 
index 00000000..afcfcfe4 --- /dev/null +++ b/manifests/thanos/thanos-peers-svc.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: thanos-peers +spec: + type: ClusterIP + clusterIP: None + ports: + - name: cluster + port: 10900 + targetPort: cluster + selector: + # Useful endpoint for gathering all thanos components for common gossip cluster. + thanos-peer: "true" \ No newline at end of file From 507617e150d163af65628cd914234d3871bdffa1 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 10 Apr 2018 10:51:00 +0200 Subject: [PATCH 229/638] Remove old manifests and replace with jsonnet build --- Makefile | 4 +- README.md | 68 +- build.sh | 19 - example-dist/base/kube-prometheus.jsonnet | 6 + example-dist/bootkube/.gitignore | 2 + example-dist/bootkube/kube-prometheus.jsonnet | 36 + example-dist/kubeadm/.gitignore | 2 + example-dist/kubeadm/kube-prometheus.jsonnet | 31 + .../basic-auth/secrets.yaml | 0 .../basic-auth/service-monitor.yaml | 0 .../example-app/example-app.yaml | 0 ...d-alertmanager-discovery-role-binding.yaml | 0 ...-frontend-alertmanager-discovery-role.yaml | 0 .../prometheus-frontend-role-binding.yaml | 0 .../example-app/prometheus-frontend-role.yaml | 0 .../prometheus-frontend-service-account.yaml | 0 .../example-app/prometheus-frontend-svc.yaml | 0 .../example-app/prometheus-frontend.yaml | 0 .../example-app/servicemonitor-frontend.yaml | 0 .../custom-metrics-api/.gitignore | 0 .../custom-metrics-api/README.md | 0 ...r-auth-delegator-cluster-role-binding.yaml | 0 ...cs-apiserver-auth-reader-role-binding.yaml | 0 .../custom-metrics-apiserver-deployment.yaml | 0 ...-resource-reader-cluster-role-binding.yaml | 0 ...tom-metrics-apiserver-service-account.yaml | 0 .../custom-metrics-apiserver-service.yaml | 0 .../custom-metrics-apiservice.yaml | 0 .../custom-metrics-cluster-role.yaml | 0 ...-metrics-resource-reader-cluster-role.yaml | 0 .../custom-metrics-api/deploy.sh | 0 .../custom-metrics-api/gencerts.sh | 0 
...a-custom-metrics-cluster-role-binding.yaml | 0 .../custom-metrics-api/teardown.sh | 0 .../metrics-server/auth-delegator.yaml | 0 .../metrics-server/auth-reader.yaml | 0 .../metrics-server/metrics-apiservice.yaml | 0 .../metrics-server-cluster-role-binding.yaml | 0 .../metrics-server-cluster-role.yaml | 0 .../metrics-server-deployment.yaml | 0 .../metrics-server-service-account.yaml | 0 .../metrics-server-service.yaml | 0 hack/cluster-monitoring/deploy | 44 +- hack/cluster-monitoring/minikube-deploy | 17 - hack/cluster-monitoring/minikube-teardown | 6 - hack/cluster-monitoring/self-hosted-deploy | 6 - hack/cluster-monitoring/self-hosted-teardown | 6 - hack/cluster-monitoring/teardown | 28 +- hack/example-service-monitoring/deploy | 2 +- hack/example-service-monitoring/teardown | 2 +- hack/scripts/build-jsonnet.sh | 25 + .../generate-alertmanager-config-secret.sh | 11 - hack/scripts/generate-dashboards-configmap.sh | 39 - .../generate-grafana-credentials-secret.sh | 20 - hack/scripts/generate-manifests.sh | 26 - hack/scripts/generate-rules-configmap.sh | 18 - hack/scripts/wrap-dashboard.sh | 51 - .../alertmanager-main-secret.libsonnet | 21 +- jsonnet/kube-prometheus.jsonnet | 62 - jsonnet/kube-prometheus.libsonnet | 85 + .../prometheus-operator-deployment.libsonnet | 4 +- .../prometheus/prometheus-k8s-rules.libsonnet | 8 + .../prometheus-k8s-service.libsonnet | 1 - jsonnet/prometheus/prometheus.libsonnet | 1 + .../alertmanager-main-secret.yaml} | 6 +- .../alertmanager-main-service-account.yaml | 5 + .../alertmanager-main-service.yaml} | 5 +- .../alertmanager-main.yaml} | 4 +- manifests/etcd/etcd-bootkube-gce.yaml | 28 - .../etcd/etcd-bootkube-vagrant-multi.yaml | 28 - manifests/grafana/grafana-credentials.yaml | 7 - .../grafana-dashboard-definitions.yaml | 12757 +++++++--------- .../grafana/grafana-dashboard-sources.yaml | 18 + manifests/grafana/grafana-dashboards.yaml | 12 - manifests/grafana/grafana-datasources.yaml | 27 +- 
manifests/grafana/grafana-deployment.yaml | 75 +- .../grafana/grafana-service-account.yaml | 5 + manifests/grafana/grafana-service.yaml | 11 +- .../k8s/kubeadm/kube-controller-manager.yaml | 17 - manifests/k8s/kubeadm/kube-scheduler.yaml | 17 - .../self-hosted/kube-controller-manager.yaml | 17 - manifests/k8s/self-hosted/kube-dns.yaml | 21 - manifests/k8s/self-hosted/kube-scheduler.yaml | 17 - ...be-state-metrics-cluster-role-binding.yaml | 2 +- .../kube-state-metrics-cluster-role.yaml | 51 +- .../kube-state-metrics-deployment.yaml | 149 +- .../kube-state-metrics-role-binding.yaml | 6 +- .../kube-state-metrics-role.yaml | 22 +- .../kube-state-metrics-service-account.yaml | 1 + .../kube-state-metrics-service.yaml | 6 +- .../node-exporter-cluster-role.yaml | 12 +- .../node-exporter-daemonset.yaml | 102 +- .../node-exporter-service-account.yaml | 1 + .../node-exporter/node-exporter-service.yaml | 7 +- .../prometheus-k8s-cluster-role-binding.yaml | 12 + .../prometheus-k8s-cluster-role.yaml | 15 + .../prometheus-k8s-role-binding-config.yaml | 13 + .../prometheus-k8s-role-binding-default.yaml | 13 + ...ometheus-k8s-role-binding-kube-system.yaml | 13 + ...prometheus-k8s-role-binding-namespace.yaml | 13 + .../prometheus-k8s-role-config.yaml | 12 + .../prometheus-k8s-role-default.yaml | 17 + .../prometheus-k8s-role-kube-system.yaml | 17 + .../prometheus-k8s-role-namespace.yaml | 17 + .../prometheus-k8s-rules.yaml | 180 +- .../prometheus-k8s-service-account.yaml | 1 + ...heus-k8s-service-monitor-alertmanager.yaml | 15 +- ...metheus-k8s-service-monitor-apiserver.yaml | 23 +- ...rometheus-k8s-service-monitor-coredns.yaml | 19 +- ...rvice-monitor-kube-controller-manager.yaml | 15 +- ...us-k8s-service-monitor-kube-scheduler.yaml | 15 +- ...8s-service-monitor-kube-state-metrics.yaml | 37 +- ...rometheus-k8s-service-monitor-kubelet.yaml | 27 +- ...eus-k8s-service-monitor-node-exporter.yaml | 23 +- ...s-service-monitor-prometheus-operator.yaml | 3 +- 
...etheus-k8s-service-monitor-prometheus.yaml | 15 +- .../prometheus-k8s-service.yaml | 5 +- .../prometheus-k8s.yaml | 36 +- ...metheus-operator-cluster-role-binding.yaml | 2 +- .../prometheus-operator-cluster-role.yaml | 46 +- ...ml => prometheus-operator-deployment.yaml} | 1 + .../prometheus-operator-service-account.yaml | 1 + .../prometheus-operator-service.yaml | 5 +- .../prometheus-k8s-role-bindings.yaml | 54 - .../prometheus/prometheus-k8s-roles.yaml | 55 - ...rometheus-k8s-service-coredns-metrics.yaml | 18 - requirements.txt | 1 - 127 files changed, 6332 insertions(+), 8494 deletions(-) delete mode 100755 build.sh create mode 100644 example-dist/base/kube-prometheus.jsonnet create mode 100644 example-dist/bootkube/.gitignore create mode 100644 example-dist/bootkube/kube-prometheus.jsonnet create mode 100644 example-dist/kubeadm/.gitignore create mode 100644 example-dist/kubeadm/kube-prometheus.jsonnet rename {manifests/examples => examples}/basic-auth/secrets.yaml (100%) rename {manifests/examples => examples}/basic-auth/service-monitor.yaml (100%) rename {manifests/examples => examples}/example-app/example-app.yaml (100%) rename {manifests/examples => examples}/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml (100%) rename {manifests/examples => examples}/example-app/prometheus-frontend-alertmanager-discovery-role.yaml (100%) rename {manifests/examples => examples}/example-app/prometheus-frontend-role-binding.yaml (100%) rename {manifests/examples => examples}/example-app/prometheus-frontend-role.yaml (100%) rename {manifests/examples => examples}/example-app/prometheus-frontend-service-account.yaml (100%) rename {manifests/examples => examples}/example-app/prometheus-frontend-svc.yaml (100%) rename {manifests/examples => examples}/example-app/prometheus-frontend.yaml (100%) rename {manifests/examples => examples}/example-app/servicemonitor-frontend.yaml (100%) rename {manifests => experimental}/custom-metrics-api/.gitignore (100%) 
rename {manifests => experimental}/custom-metrics-api/README.md (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-apiserver-deployment.yaml (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-apiserver-service-account.yaml (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-apiserver-service.yaml (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-apiservice.yaml (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-cluster-role.yaml (100%) rename {manifests => experimental}/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml (100%) rename {manifests => experimental}/custom-metrics-api/deploy.sh (100%) rename {manifests => experimental}/custom-metrics-api/gencerts.sh (100%) rename {manifests => experimental}/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml (100%) rename {manifests => experimental}/custom-metrics-api/teardown.sh (100%) rename {manifests => experimental}/metrics-server/auth-delegator.yaml (100%) rename {manifests => experimental}/metrics-server/auth-reader.yaml (100%) rename {manifests => experimental}/metrics-server/metrics-apiservice.yaml (100%) rename {manifests => experimental}/metrics-server/metrics-server-cluster-role-binding.yaml (100%) rename {manifests => experimental}/metrics-server/metrics-server-cluster-role.yaml (100%) rename {manifests => experimental}/metrics-server/metrics-server-deployment.yaml (100%) rename {manifests => experimental}/metrics-server/metrics-server-service-account.yaml (100%) rename {manifests => 
experimental}/metrics-server/metrics-server-service.yaml (100%) delete mode 100755 hack/cluster-monitoring/minikube-deploy delete mode 100755 hack/cluster-monitoring/minikube-teardown delete mode 100755 hack/cluster-monitoring/self-hosted-deploy delete mode 100755 hack/cluster-monitoring/self-hosted-teardown create mode 100755 hack/scripts/build-jsonnet.sh delete mode 100755 hack/scripts/generate-alertmanager-config-secret.sh delete mode 100755 hack/scripts/generate-dashboards-configmap.sh delete mode 100755 hack/scripts/generate-grafana-credentials-secret.sh delete mode 100755 hack/scripts/generate-manifests.sh delete mode 100755 hack/scripts/generate-rules-configmap.sh delete mode 100755 hack/scripts/wrap-dashboard.sh delete mode 100644 jsonnet/kube-prometheus.jsonnet create mode 100644 jsonnet/kube-prometheus.libsonnet create mode 100644 jsonnet/prometheus/prometheus-k8s-rules.libsonnet rename manifests/{alertmanager/alertmanager-config.yaml => alertmanager-main/alertmanager-main-secret.yaml} (91%) create mode 100644 manifests/alertmanager-main/alertmanager-main-service-account.yaml rename manifests/{alertmanager/alertmanager-service.yaml => alertmanager-main/alertmanager-main-service.yaml} (78%) rename manifests/{alertmanager/alertmanager.yaml => alertmanager-main/alertmanager-main.yaml} (70%) delete mode 100644 manifests/etcd/etcd-bootkube-gce.yaml delete mode 100644 manifests/etcd/etcd-bootkube-vagrant-multi.yaml delete mode 100644 manifests/grafana/grafana-credentials.yaml create mode 100644 manifests/grafana/grafana-dashboard-sources.yaml delete mode 100644 manifests/grafana/grafana-dashboards.yaml create mode 100644 manifests/grafana/grafana-service-account.yaml delete mode 100644 manifests/k8s/kubeadm/kube-controller-manager.yaml delete mode 100644 manifests/k8s/kubeadm/kube-scheduler.yaml delete mode 100644 manifests/k8s/self-hosted/kube-controller-manager.yaml delete mode 100644 manifests/k8s/self-hosted/kube-dns.yaml delete mode 100644 
manifests/k8s/self-hosted/kube-scheduler.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-cluster-role.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-role-binding-default.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-role-config.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-role-default.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-role-kube-system.yaml create mode 100644 manifests/prometheus-k8s/prometheus-k8s-role-namespace.yaml rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-rules.yaml (83%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-account.yaml (74%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-alertmanager.yaml (81%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-apiserver.yaml (81%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-coredns.yaml (69%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-kube-controller-manager.yaml (82%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-kube-scheduler.yaml (81%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-kube-state-metrics.yaml (71%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-kubelet.yaml (71%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-node-exporter.yaml (78%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-prometheus-operator.yaml (90%) rename 
manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service-monitor-prometheus.yaml (81%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s-service.yaml (77%) rename manifests/{prometheus => prometheus-k8s}/prometheus-k8s.yaml (54%) rename manifests/prometheus-operator/{prometheus-operator.yaml => prometheus-operator-deployment.yaml} (97%) delete mode 100644 manifests/prometheus/prometheus-k8s-role-bindings.yaml delete mode 100644 manifests/prometheus/prometheus-k8s-roles.yaml delete mode 100644 manifests/prometheus/prometheus-k8s-service-coredns-metrics.yaml delete mode 100644 requirements.txt diff --git a/Makefile b/Makefile index 26084ae4..8ff81356 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ image: generate: image @echo ">> Compiling assets and generating Kubernetes manifests" - docker run --rm -v `pwd`:/go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make generate-raw + docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v `pwd`:/go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make generate-raw generate-raw: - ./hack/scripts/generate-manifests.sh + ./hack/scripts/build-jsonnet.sh example-dist/base/kube-prometheus.jsonnet manifests diff --git a/README.md b/README.md index 4ada050d..7defae27 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # kube-prometheus +> Note that everything in the `contrib/kube-prometheus/` directory is experimental and may change significantly at any time. 
+ This repository collects Kubernetes manifests, [Grafana](http://grafana.com/) dashboards, and [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) combined with documentation and scripts to provide single-command deployments of end-to-end @@ -46,16 +48,15 @@ install Simply run: ```bash -export KUBECONFIG= # defaults to "~/.kube/config" cd contrib/kube-prometheus/ hack/cluster-monitoring/deploy ``` -After all pods are ready, you can reach: +After all pods are ready, you can reach each of the UIs by port-forwarding: -* Prometheus UI on node port `30900` -* Alertmanager UI on node port `30903` -* Grafana on node port `30902` +* Prometheus UI via `kubectl -n monitoring port-forward prometheus-k8s-0 9090` +* Alertmanager UI via `kubectl -n monitoring port-forward alertmanager-main-0 9093` +* Grafana via `kubectl -n monitoring port-forward $(kubectl get pods -n monitoring -lapp=grafana -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') 3000` To tear it all down again, run: @@ -63,9 +64,53 @@ To tear it all down again, run: hack/cluster-monitoring/teardown ``` +## Customizing + +As everyone's infrastructure is slightly different, different organizations have different requirements. Therefore, there may be modifications you want to make to kube-prometheus to fit your needs. + +The kube-prometheus stack is intended to be a jsonnet library for organizations to consume and use in their own infrastructure repository. Below is an example of how it can be used to deploy the stack properly on minikube. + +The three "distribution" examples we have assembled can be found in: + +* `example-dist/base`: contains the plain kube-prometheus stack for organizations to build on. +* `example-dist/kubeadm`: contains the kube-prometheus stack with slight modifications to work properly monitoring kubeadm clusters and exposes UIs on NodePorts for demonstration purposes. 
+* `example-dist/bootkube`: contains the kube-prometheus stack with slight modifications to work properly on clusters created with bootkube. + +The examples in `example-dist/` are purely meant for demonstration purposes; the `kube-prometheus.jsonnet` file should live in your organization's infrastructure repository and use the kube-prometheus library provided here. + +Examples of additional modifications you may want to make could be adding an `Ingress` object for each of the UIs, but the point of this is that as opposed to other solutions out there, this library does not need to yield all possible customization options, it's all up to the user to customize! + +### minikube kubeadm example + +See `example-dist/kubeadm` for an example of deploying on minikube, using the minikube kubeadm bootstrapper. The `example-dist/kubeadm/kube-prometheus.jsonnet` file renders the kube-prometheus manifests using jsonnet and then merges the result with kubeadm specifics, such as information on how to monitor kube-controller-manager and kube-scheduler as created by kubeadm. In addition, for demonstration purposes, it converts the services selecting Prometheus, Alertmanager and Grafana to NodePort services. 
+ +Let's give that a try, and create a minikube cluster: + +``` +minikube delete && minikube start --kubernetes-version=v1.9.6 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +``` + +Then we can render the manifests for kubeadm (because we are using the minikube kubeadm bootstrapper): + +``` +docker run --rm \ + -v `pwd`:/go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ + --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ + po-jsonnet \ + ./hack/scripts/build-jsonnet.sh example-dist/kubeadm/kube-prometheus.jsonnet example-dist/kubeadm/manifests +``` + +> Note the `po-jsonnet` docker image is built using [this Dockerfile](/scripts/jsonnet/Dockerfile), you can also build it using `make image` from the `contrib/kube-prometheus` folder. + +Then the stack can be deployed using + +``` +hack/cluster-monitoring/deploy example-dist/kubeadm +``` + ## Monitoring custom services -The example manifests in [manifests/examples/example-app](/contrib/kube-prometheus/manifests/examples/example-app) +The example manifests in [examples/example-app](/contrib/kube-prometheus/examples/example-app) deploy a fake service exposing Prometheus metrics. They additionally define a new Prometheus server and a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/master/Documentation/design.md#servicemonitor), which specifies how the example service should be monitored. @@ -76,10 +121,13 @@ manage its life cycle. hack/example-service-monitoring/deploy ``` -After all pods are ready you can reach the Prometheus server on node port `30100` and observe -how it monitors the service as specified. 
Same as before, this Prometheus server automatically -discovers the Alertmanager cluster deployed in the [Monitoring Kubernetes](#Monitoring-Kubernetes) -section. +After all pods are ready you can reach the Prometheus server similar to the Prometheus server above: + +```bash +kubectl port-forward prometheus-frontend-0 9090 +``` + +Then you can access Prometheus through `http://localhost:9090/`. Teardown: diff --git a/build.sh b/build.sh deleted file mode 100755 index a42a6bb7..00000000 --- a/build.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash -set -e -set -x - -prefix="tmp/manifests" -json="tmp/manifests.json" - -rm -rf ${prefix} -mkdir -p $(dirname "${json}") -jsonnet -J /home/brancz/.jsonnet-bundler/src/git/git@github.com-ksonnet-ksonnet-lib/master jsonnet/kube-prometheus.jsonnet > ${json} - -files=$(jq -r 'keys[]' ${json}) - -for file in ${files}; do - dir=$(dirname "${file}") - path="${prefix}/${dir}" - mkdir -p ${path} - jq -r ".[\"${file}\"]" ${json} | yaml2json | json2yaml > "${prefix}/${file}" -done diff --git a/example-dist/base/kube-prometheus.jsonnet b/example-dist/base/kube-prometheus.jsonnet new file mode 100644 index 00000000..01760e65 --- /dev/null +++ b/example-dist/base/kube-prometheus.jsonnet @@ -0,0 +1,6 @@ +local kubePrometheus = import "kube-prometheus.libsonnet"; + +local namespace = "monitoring"; +local objects = kubePrometheus.new(namespace); + +{[path]: std.manifestYamlDoc(objects[path]) for path in std.objectFields(objects)} diff --git a/example-dist/bootkube/.gitignore b/example-dist/bootkube/.gitignore new file mode 100644 index 00000000..4ea90de6 --- /dev/null +++ b/example-dist/bootkube/.gitignore @@ -0,0 +1,2 @@ +tmp/ +manifests/ diff --git a/example-dist/bootkube/kube-prometheus.jsonnet b/example-dist/bootkube/kube-prometheus.jsonnet new file mode 100644 index 00000000..fa731106 --- /dev/null +++ b/example-dist/bootkube/kube-prometheus.jsonnet @@ -0,0 +1,36 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; +local service = 
k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; +local kubePrometheus = import "kube-prometheus.libsonnet"; + +local namespace = "monitoring"; + +local controllerManagerService = service.new("kube-controller-manager-prometheus-discovery", {"k8s-app": "kube-controller-manager"}, servicePort.newNamed("http-metrics", 10252, 10252)) + + service.mixin.metadata.withNamespace("kube-system") + + service.mixin.metadata.withLabels({"k8s-app": "kube-controller-manager"}); + +local schedulerService = service.new("kube-scheduler-prometheus-discovery", {"k8s-app": "kube-scheduler"}, servicePort.newNamed("http-metrics", 10251, 10251)) + + service.mixin.metadata.withNamespace("kube-system") + + service.mixin.metadata.withLabels({"k8s-app": "kube-scheduler"}); + +local kubeDNSService = service.new("kube-dns-prometheus-discovery", {"k8s-app": "kube-dns"}, [servicePort.newNamed("http-metrics-skydns", 10055, 10055), servicePort.newNamed("http-metrics-dnsmasq", 10054, 10054)]) + + service.mixin.metadata.withNamespace("kube-system") + + service.mixin.metadata.withLabels({"k8s-app": "kube-dns"}); + +local objects = kubePrometheus.new(namespace) + + { + "prometheus-k8s/prometheus-k8s-service.yaml"+: + service.mixin.spec.withPorts(servicePort.newNamed("web", 9090, "web") + servicePort.withNodePort(30900)) + + service.mixin.spec.withType("NodePort"), + "alertmanager-main/alertmanager-main-service.yaml"+: + service.mixin.spec.withPorts(servicePort.newNamed("web", 9093, "web") + servicePort.withNodePort(30903)) + + service.mixin.spec.withType("NodePort"), + "grafana/grafana-service.yaml"+: + service.mixin.spec.withPorts(servicePort.newNamed("http", 3000, "http") + servicePort.withNodePort(30902)) + + service.mixin.spec.withType("NodePort"), + "prometheus-k8s/kube-controller-manager-prometheus-discovery-service.yaml": controllerManagerService, + "prometheus-k8s/kube-scheduler-prometheus-discovery-service.yaml": schedulerService, + 
"prometheus-k8s/kube-dns-prometheus-discovery-service.yaml": kubeDNSService, + }; + +{[path]: std.manifestYamlDoc(objects[path]) for path in std.objectFields(objects)} diff --git a/example-dist/kubeadm/.gitignore b/example-dist/kubeadm/.gitignore new file mode 100644 index 00000000..4ea90de6 --- /dev/null +++ b/example-dist/kubeadm/.gitignore @@ -0,0 +1,2 @@ +tmp/ +manifests/ diff --git a/example-dist/kubeadm/kube-prometheus.jsonnet b/example-dist/kubeadm/kube-prometheus.jsonnet new file mode 100644 index 00000000..50ce1020 --- /dev/null +++ b/example-dist/kubeadm/kube-prometheus.jsonnet @@ -0,0 +1,31 @@ +local k = import "ksonnet.beta.3/k.libsonnet"; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; +local kubePrometheus = import "kube-prometheus.libsonnet"; + +local namespace = "monitoring"; + +local controllerManagerService = service.new("kube-controller-manager-prometheus-discovery", {component: "kube-controller-manager"}, servicePort.newNamed("http-metrics", 10252, 10252)) + + service.mixin.metadata.withNamespace("kube-system") + + service.mixin.metadata.withLabels({"k8s-app": "kube-controller-manager"}); + +local schedulerService = service.new("kube-scheduler-prometheus-discovery", {component: "kube-scheduler"}, servicePort.newNamed("http-metrics", 10251, 10251)) + + service.mixin.metadata.withNamespace("kube-system") + + service.mixin.metadata.withLabels({"k8s-app": "kube-scheduler"}); + +local objects = kubePrometheus.new(namespace) + + { + "prometheus-k8s/prometheus-k8s-service.yaml"+: + service.mixin.spec.withPorts(servicePort.newNamed("web", 9090, "web") + servicePort.withNodePort(30900)) + + service.mixin.spec.withType("NodePort"), + "alertmanager-main/alertmanager-main-service.yaml"+: + service.mixin.spec.withPorts(servicePort.newNamed("web", 9093, "web") + servicePort.withNodePort(30903)) + + service.mixin.spec.withType("NodePort"), + "grafana/grafana-service.yaml"+: + 
service.mixin.spec.withPorts(servicePort.newNamed("http", 3000, "http") + servicePort.withNodePort(30902)) + + service.mixin.spec.withType("NodePort"), + "prometheus-k8s/kube-controller-manager-prometheus-discovery-service.yaml": controllerManagerService, + "prometheus-k8s/kube-scheduler-prometheus-discovery-service.yaml": schedulerService, + }; + +{[path]: std.manifestYamlDoc(objects[path]) for path in std.objectFields(objects)} diff --git a/manifests/examples/basic-auth/secrets.yaml b/examples/basic-auth/secrets.yaml similarity index 100% rename from manifests/examples/basic-auth/secrets.yaml rename to examples/basic-auth/secrets.yaml diff --git a/manifests/examples/basic-auth/service-monitor.yaml b/examples/basic-auth/service-monitor.yaml similarity index 100% rename from manifests/examples/basic-auth/service-monitor.yaml rename to examples/basic-auth/service-monitor.yaml diff --git a/manifests/examples/example-app/example-app.yaml b/examples/example-app/example-app.yaml similarity index 100% rename from manifests/examples/example-app/example-app.yaml rename to examples/example-app/example-app.yaml diff --git a/manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml b/examples/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml similarity index 100% rename from manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml rename to examples/example-app/prometheus-frontend-alertmanager-discovery-role-binding.yaml diff --git a/manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role.yaml b/examples/example-app/prometheus-frontend-alertmanager-discovery-role.yaml similarity index 100% rename from manifests/examples/example-app/prometheus-frontend-alertmanager-discovery-role.yaml rename to examples/example-app/prometheus-frontend-alertmanager-discovery-role.yaml diff --git a/manifests/examples/example-app/prometheus-frontend-role-binding.yaml 
b/examples/example-app/prometheus-frontend-role-binding.yaml similarity index 100% rename from manifests/examples/example-app/prometheus-frontend-role-binding.yaml rename to examples/example-app/prometheus-frontend-role-binding.yaml diff --git a/manifests/examples/example-app/prometheus-frontend-role.yaml b/examples/example-app/prometheus-frontend-role.yaml similarity index 100% rename from manifests/examples/example-app/prometheus-frontend-role.yaml rename to examples/example-app/prometheus-frontend-role.yaml diff --git a/manifests/examples/example-app/prometheus-frontend-service-account.yaml b/examples/example-app/prometheus-frontend-service-account.yaml similarity index 100% rename from manifests/examples/example-app/prometheus-frontend-service-account.yaml rename to examples/example-app/prometheus-frontend-service-account.yaml diff --git a/manifests/examples/example-app/prometheus-frontend-svc.yaml b/examples/example-app/prometheus-frontend-svc.yaml similarity index 100% rename from manifests/examples/example-app/prometheus-frontend-svc.yaml rename to examples/example-app/prometheus-frontend-svc.yaml diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/examples/example-app/prometheus-frontend.yaml similarity index 100% rename from manifests/examples/example-app/prometheus-frontend.yaml rename to examples/example-app/prometheus-frontend.yaml diff --git a/manifests/examples/example-app/servicemonitor-frontend.yaml b/examples/example-app/servicemonitor-frontend.yaml similarity index 100% rename from manifests/examples/example-app/servicemonitor-frontend.yaml rename to examples/example-app/servicemonitor-frontend.yaml diff --git a/manifests/custom-metrics-api/.gitignore b/experimental/custom-metrics-api/.gitignore similarity index 100% rename from manifests/custom-metrics-api/.gitignore rename to experimental/custom-metrics-api/.gitignore diff --git a/manifests/custom-metrics-api/README.md b/experimental/custom-metrics-api/README.md similarity 
index 100% rename from manifests/custom-metrics-api/README.md rename to experimental/custom-metrics-api/README.md diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml similarity index 100% rename from manifests/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml rename to experimental/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml similarity index 100% rename from manifests/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml rename to experimental/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-deployment.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-deployment.yaml similarity index 100% rename from manifests/custom-metrics-api/custom-metrics-apiserver-deployment.yaml rename to experimental/custom-metrics-api/custom-metrics-apiserver-deployment.yaml diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml similarity index 100% rename from manifests/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml rename to experimental/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-service-account.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-service-account.yaml similarity index 100% rename from 
manifests/custom-metrics-api/custom-metrics-apiserver-service-account.yaml rename to experimental/custom-metrics-api/custom-metrics-apiserver-service-account.yaml diff --git a/manifests/custom-metrics-api/custom-metrics-apiserver-service.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-service.yaml similarity index 100% rename from manifests/custom-metrics-api/custom-metrics-apiserver-service.yaml rename to experimental/custom-metrics-api/custom-metrics-apiserver-service.yaml diff --git a/manifests/custom-metrics-api/custom-metrics-apiservice.yaml b/experimental/custom-metrics-api/custom-metrics-apiservice.yaml similarity index 100% rename from manifests/custom-metrics-api/custom-metrics-apiservice.yaml rename to experimental/custom-metrics-api/custom-metrics-apiservice.yaml diff --git a/manifests/custom-metrics-api/custom-metrics-cluster-role.yaml b/experimental/custom-metrics-api/custom-metrics-cluster-role.yaml similarity index 100% rename from manifests/custom-metrics-api/custom-metrics-cluster-role.yaml rename to experimental/custom-metrics-api/custom-metrics-cluster-role.yaml diff --git a/manifests/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml b/experimental/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml similarity index 100% rename from manifests/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml rename to experimental/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml diff --git a/manifests/custom-metrics-api/deploy.sh b/experimental/custom-metrics-api/deploy.sh similarity index 100% rename from manifests/custom-metrics-api/deploy.sh rename to experimental/custom-metrics-api/deploy.sh diff --git a/manifests/custom-metrics-api/gencerts.sh b/experimental/custom-metrics-api/gencerts.sh similarity index 100% rename from manifests/custom-metrics-api/gencerts.sh rename to experimental/custom-metrics-api/gencerts.sh diff --git 
a/manifests/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml b/experimental/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml similarity index 100% rename from manifests/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml rename to experimental/custom-metrics-api/hpa-custom-metrics-cluster-role-binding.yaml diff --git a/manifests/custom-metrics-api/teardown.sh b/experimental/custom-metrics-api/teardown.sh similarity index 100% rename from manifests/custom-metrics-api/teardown.sh rename to experimental/custom-metrics-api/teardown.sh diff --git a/manifests/metrics-server/auth-delegator.yaml b/experimental/metrics-server/auth-delegator.yaml similarity index 100% rename from manifests/metrics-server/auth-delegator.yaml rename to experimental/metrics-server/auth-delegator.yaml diff --git a/manifests/metrics-server/auth-reader.yaml b/experimental/metrics-server/auth-reader.yaml similarity index 100% rename from manifests/metrics-server/auth-reader.yaml rename to experimental/metrics-server/auth-reader.yaml diff --git a/manifests/metrics-server/metrics-apiservice.yaml b/experimental/metrics-server/metrics-apiservice.yaml similarity index 100% rename from manifests/metrics-server/metrics-apiservice.yaml rename to experimental/metrics-server/metrics-apiservice.yaml diff --git a/manifests/metrics-server/metrics-server-cluster-role-binding.yaml b/experimental/metrics-server/metrics-server-cluster-role-binding.yaml similarity index 100% rename from manifests/metrics-server/metrics-server-cluster-role-binding.yaml rename to experimental/metrics-server/metrics-server-cluster-role-binding.yaml diff --git a/manifests/metrics-server/metrics-server-cluster-role.yaml b/experimental/metrics-server/metrics-server-cluster-role.yaml similarity index 100% rename from manifests/metrics-server/metrics-server-cluster-role.yaml rename to experimental/metrics-server/metrics-server-cluster-role.yaml diff --git 
a/manifests/metrics-server/metrics-server-deployment.yaml b/experimental/metrics-server/metrics-server-deployment.yaml similarity index 100% rename from manifests/metrics-server/metrics-server-deployment.yaml rename to experimental/metrics-server/metrics-server-deployment.yaml diff --git a/manifests/metrics-server/metrics-server-service-account.yaml b/experimental/metrics-server/metrics-server-service-account.yaml similarity index 100% rename from manifests/metrics-server/metrics-server-service-account.yaml rename to experimental/metrics-server/metrics-server-service-account.yaml diff --git a/manifests/metrics-server/metrics-server-service.yaml b/experimental/metrics-server/metrics-server-service.yaml similarity index 100% rename from manifests/metrics-server/metrics-server-service.yaml rename to experimental/metrics-server/metrics-server-service.yaml diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index a4f7c184..41e05187 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -1,40 +1,24 @@ #!/usr/bin/env bash -if [ -z "${KUBECONFIG}" ]; then - export KUBECONFIG=~/.kube/config -fi +manifest_prefix=${1-.} -# CAUTION - setting NAMESPACE will deploy most components to the given namespace -# however some are hardcoded to 'monitoring'. Only use if you have reviewed all manifests. +kubectl create namespace monitoring -if [ -z "${NAMESPACE}" ]; then - NAMESPACE=monitoring -fi - -kubectl create namespace "$NAMESPACE" - -kctl() { - kubectl --namespace "$NAMESPACE" "$@" -} - -kctl apply -f manifests/prometheus-operator +kubectl apply -f ${manifest_prefix}/manifests/prometheus-operator/ # Wait for CRDs to be ready. printf "Waiting for Operator to register custom resource definitions..." 
-until kctl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kubectl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kubectl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kubectl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kubectl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kubectl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kubectl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done echo "done!" -kctl apply -f manifests/node-exporter -kctl apply -f manifests/kube-state-metrics -kctl apply -f manifests/grafana/grafana-credentials.yaml -kctl apply -f manifests/grafana -find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! 
-name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \; -kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml -kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml -kctl apply -f manifests/alertmanager/ +kubectl apply -f ${manifest_prefix}/manifests/node-exporter/ +kubectl apply -f ${manifest_prefix}/manifests/kube-state-metrics/ +kubectl apply -f ${manifest_prefix}/manifests/grafana/ +kubectl apply -f ${manifest_prefix}/manifests/prometheus-k8s/ +kubectl apply -f ${manifest_prefix}/manifests/alertmanager-main/ diff --git a/hack/cluster-monitoring/minikube-deploy b/hack/cluster-monitoring/minikube-deploy deleted file mode 100755 index 64cb86be..00000000 --- a/hack/cluster-monitoring/minikube-deploy +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -# We assume that the kubelet uses token authN and authZ, as otherwise -# Prometheus needs a client certificate, which gives it full access to the -# kubelet, rather than just the metrics. Token authN and authZ allows more fine -# grained and easier access control. Simply start minikube with the following -# command (you can of course adapt the version and memory to your needs): -# -# $ minikube delete && minikube start --kubernetes-version=v1.9.1 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 -# -# In future versions of minikube and kubeadm this will be the default, but for -# the time being, we will have to configure it ourselves. 
- -hack/cluster-monitoring/deploy - -kubectl --namespace=kube-system apply -f manifests/k8s/kubeadm/ - diff --git a/hack/cluster-monitoring/minikube-teardown b/hack/cluster-monitoring/minikube-teardown deleted file mode 100755 index 3a4c986e..00000000 --- a/hack/cluster-monitoring/minikube-teardown +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -hack/cluster-monitoring/teardown - -kubectl --namespace=kube-system delete -f manifests/k8s/minikube - diff --git a/hack/cluster-monitoring/self-hosted-deploy b/hack/cluster-monitoring/self-hosted-deploy deleted file mode 100755 index 7cbce37d..00000000 --- a/hack/cluster-monitoring/self-hosted-deploy +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -hack/cluster-monitoring/deploy - -kubectl apply -f manifests/k8s/self-hosted - diff --git a/hack/cluster-monitoring/self-hosted-teardown b/hack/cluster-monitoring/self-hosted-teardown deleted file mode 100755 index f9d7da9f..00000000 --- a/hack/cluster-monitoring/self-hosted-teardown +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -hack/cluster-monitoring/teardown - -kubectl delete -f manifests/k8s/self-hosted - diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown index b2c4c544..0ef9a6b3 100755 --- a/hack/cluster-monitoring/teardown +++ b/hack/cluster-monitoring/teardown @@ -1,30 +1,4 @@ #!/usr/bin/env bash -if [ -z "${KUBECONFIG}" ]; then - export KUBECONFIG=~/.kube/config -fi - -# CAUTION - NAMESPACE must match its value when deploy script was run. -# Some resources are always deployed to the monitoring namespace. - -if [ -z "${NAMESPACE}" ]; then - NAMESPACE=monitoring -fi - -kctl() { - kubectl --namespace "$NAMESPACE" "$@" -} - -kctl delete -f manifests/node-exporter -kctl delete -f manifests/kube-state-metrics -kctl delete -f manifests/grafana -find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! 
-name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \; -kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml -kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml -kctl delete -f manifests/alertmanager - -# Hack: wait a bit to let the controller delete the deployed Prometheus server. -sleep 5 - -kctl delete -f manifests/prometheus-operator +kubectl delete namespace monitoring diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy index 18b0ef6a..4912dd96 100755 --- a/hack/example-service-monitoring/deploy +++ b/hack/example-service-monitoring/deploy @@ -1,3 +1,3 @@ #!/usr/bin/env bash -kubectl apply -f manifests/examples/example-app +kubectl apply -f examples/example-app diff --git a/hack/example-service-monitoring/teardown b/hack/example-service-monitoring/teardown index a5fc1760..62b546de 100755 --- a/hack/example-service-monitoring/teardown +++ b/hack/example-service-monitoring/teardown @@ -1,3 +1,3 @@ #!/usr/bin/env bash -kubectl delete -f manifests/examples/example-app +kubectl delete -f examples/example-app diff --git a/hack/scripts/build-jsonnet.sh b/hack/scripts/build-jsonnet.sh new file mode 100755 index 00000000..7189962f --- /dev/null +++ b/hack/scripts/build-jsonnet.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -e +set -x + +jsonnet="${1-kube-prometheus.jsonnet}" +prefix="${2-manifests}" +json="tmp/manifests.json" + +rm -rf ${prefix} +mkdir -p $(dirname "${json}") +jsonnet \ + -J $GOPATH/src/github.com/ksonnet/ksonnet-lib \ + -J $GOPATH/src/github.com/grafana/grafonnet-lib \ + -J $GOPATH/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet \ + -J $GOPATH/src/github.com/brancz/kubernetes-grafana/src/kubernetes-jsonnet \ + ${jsonnet} > ${json} + +files=$(jq -r 'keys[]' ${json}) + +for file in ${files}; do + dir=$(dirname "${file}") + path="${prefix}/${dir}" + mkdir -p ${path} + jq -r ".[\"${file}\"]" ${json} | gojsontoyaml 
-yamltojson | gojsontoyaml > "${prefix}/${file}" +done diff --git a/hack/scripts/generate-alertmanager-config-secret.sh b/hack/scripts/generate-alertmanager-config-secret.sh deleted file mode 100755 index b0b4aaef..00000000 --- a/hack/scripts/generate-alertmanager-config-secret.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -cat <<-EOF -apiVersion: v1 -kind: Secret -metadata: - name: alertmanager-main -data: - alertmanager.yaml: $(cat assets/alertmanager/alertmanager.yaml | base64 --wrap=0) -EOF - diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh deleted file mode 100755 index 47ccfc12..00000000 --- a/hack/scripts/generate-dashboards-configmap.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -set -e -set +x - -cat <<-EOF -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboard-definitions-0 -data: -EOF - -for f in assets/grafana/generated/*-dashboard.json -do - rm -rf $f -done - -virtualenv -p python3 .env 2>&1 > /dev/null -source .env/bin/activate 2>&1 > /dev/null -pip install -Ur requirements.txt 2>&1 > /dev/null -for f in assets/grafana/*.dashboard.py -do - basefilename=$(basename $f) - JSON_FILENAME="assets/grafana/generated/${basefilename%%.*}-dashboard.json" - generate-dashboard $f -o $JSON_FILENAME 2>&1 > /dev/null -done - -cp assets/grafana/raw-json-dashboards/*-dashboard.json assets/grafana/generated/ - -for f in assets/grafana/generated/*-dashboard.json -do - basefilename=$(basename $f) - echo " $basefilename: |+" - if [ "$basefilename" = "etcd-dashboard.json" ]; then - hack/scripts/wrap-dashboard.sh $f prometheus-etcd | sed "s/^/ /g" - else - hack/scripts/wrap-dashboard.sh $f prometheus | sed "s/^/ /g" - fi -done diff --git a/hack/scripts/generate-grafana-credentials-secret.sh b/hack/scripts/generate-grafana-credentials-secret.sh deleted file mode 100755 index e877b080..00000000 --- a/hack/scripts/generate-grafana-credentials-secret.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -if 
[ "$#" -ne 2 ]; then - echo "Usage: $0 user password" - exit 1 -fi - -user=$1 -password=$2 - -cat <<-EOF -apiVersion: v1 -kind: Secret -metadata: - name: grafana-credentials -data: - user: $(echo -n ${user} | base64 --wrap=0) - password: $(echo -n ${password} | base64 --wrap=0) -EOF - diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh deleted file mode 100755 index 6f14056b..00000000 --- a/hack/scripts/generate-manifests.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -set -e -set +x - -# Generate Alert Rules ConfigMap -hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml - -# Generate Dashboard ConfigMap -hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboard-definitions.yaml - -# Generate Dashboard ConfigMap with configmap-generator tool -# Max Size per ConfigMap: 240000 -# Input dir: assets/grafana -# output file: manifests/grafana/grafana-dashboards.yaml -# grafana deployment output file: manifests/grafana/grafana-deployment.yaml -test -f manifests/grafana/grafana-dashboard-definitions.yaml && rm -f manifests/grafana/grafana-dashboard-definitions.yaml -test -f manifests/grafana/grafana-deployment.yaml && rm -f manifests/grafana/grafana-deployment.yaml -test -f manifests/grafana/grafana-dashboards.yaml && rm -f manifests/grafana/grafana-dashboards.yaml -hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh -s 240000 -i assets/grafana/generated -o manifests/grafana/grafana-dashboard-definitions.yaml -g manifests/grafana/grafana-deployment.yaml -d manifests/grafana/grafana-dashboards.yaml - -# Generate Grafana Credentials Secret -hack/scripts/generate-grafana-credentials-secret.sh admin admin > manifests/grafana/grafana-credentials.yaml - -# Generate Secret for Alertmanager config -hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml - diff --git a/hack/scripts/generate-rules-configmap.sh 
b/hack/scripts/generate-rules-configmap.sh deleted file mode 100755 index 96c5433f..00000000 --- a/hack/scripts/generate-rules-configmap.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -cat <<-EOF -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-k8s-rules - labels: - role: alert-rules - prometheus: k8s -data: -EOF - -for f in assets/prometheus/rules/*.rules.y*ml -do - echo " $(basename "$f"): |+" - cat $f | sed "s/^/ /g" -done diff --git a/hack/scripts/wrap-dashboard.sh b/hack/scripts/wrap-dashboard.sh deleted file mode 100755 index d3b04085..00000000 --- a/hack/scripts/wrap-dashboard.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -eu - -# Intended usage: -# * Edit dashboard in Grafana (you need to login first with admin/admin -# login/password). -# * Save dashboard in Grafana to check is specification is correct. -# Looks like this is the only way to check if dashboard specification -# has errors. -# * Download dashboard specification as JSON file in Grafana: -# Share -> Export -> Save to file. 
-# * Drop dashboard specification in assets folder: -# mv Nodes-1488465802729.json assets/grafana/node-dashboard.json -# * Regenerate Grafana configmap: -# ./hack/scripts/generate-manifests.sh -# * Apply new configmap: -# kubectl -n monitoring apply -f manifests/grafana/grafana-cm.yaml - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 path-to-dashboard.json grafana-prometheus-datasource-name" - exit 1 -fi - -dashboardjson=$1 -datasource_name=$2 -inputname="DS_PROMETHEUS" - -if [ "$datasource_name" = "prometheus-etcd" ]; then - inputname="DS_PROMETHEUS-ETCD" -fi - -cat < 5)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Crashlooping Pods", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 6, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + 
"lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Node Not Ready", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Node Disk Pressure", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", 
+ "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 8, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Node Memory Pressure", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 9, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + 
"fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kube_node_spec_unschedulable)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Node Unschedulable", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Cluster Health", + "version": 0 + } + kubernetes-cluster-status-dashboard.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + 
], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{job=\u007e\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Control Plane UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Alerts Firing", + 
"type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "API Servers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", 
+ "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "Controller Managers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 6, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": 
"N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "Schedulers Up", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts{namespace=\u007e\"kube-system|tectonic-system\"}[1h]) > 5)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Crashlooping Control Plane Pods", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + 
} + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 8, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "CPU Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, 
+ "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 9, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Memory Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 10, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + 
"span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Filesystem Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 11, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Pod Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + 
"text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Cluster Status", + "version": 0 + } + kubernetes-kubelet-dashboard.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"sum(kubelet_running_pod_count{instance=\u007e\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Count", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "kubelet_running_pod_count{instance=\u007e\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Pods", + "titleSize": "h4", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + 
"colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kubelet_running_container_count{instance=\u007e\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Count", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "kubelet_running_container_count{instance=\u007e\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Containers", + "titleSize": "h4", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Rate of Kubelet Operations in 5min", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations{instance=\u007e\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operations", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Kubelet", + "titleSize": "h4", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(kubelet_running_pod_count,instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubelet", + "version": 0 + } + nodes.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "format": "time_series", + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Idle CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 1m", + "refId": "A" + }, + { + "expr": "node_load5{instance=\"$server\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 5m", + "refId": "B" + }, + { + "expr": "node_load15{instance=\"$server\"} * 100", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "load 15m", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory used", + "refId": "A" + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "refId": "B" + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "memory cached", + "refId": "C" + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / 
node_memory_MemTotal{instance=\"$server\"}) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "written", + "refId": "B" + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "io time", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "shared": true, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Disk Space Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": 
null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!\u007e\"lo\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Received", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!\u007e\"lo\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Transmitted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [ + + ], + "query": "label_values(node_boot_time, instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 0 + } + pods-dashboard.json: |- + { + "annotations": 
{ + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current: {{ container_name }}", + "refId": "A" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "refId": "B" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, 
+ "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + 
"repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": 
"Namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [ + + ], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 0 + } kind: ConfigMap metadata: - name: grafana-dashboard-definitions-0 -data: - deployment-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 1, - "hideControls": false, - "links": [], - "rows": [ - { - "collapse": false, - "editable": false, - "height": "200px", - "panels": 
[ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "CPU", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "80%", - "rangeMaps": [ - { - "from": "null", - 
"text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "Bps", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - 
"showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "100px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "metric": "kube_deployment_spec_replicas", - "refId": "A", - "step": 600 - } - ], - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 
2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Available Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": 
[ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, 
- "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "current replicas", - "refId": "A", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B", - "step": 30 - }, - { - "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "unavailable", - "refId": "C", - "step": 30 - }, - { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "D", - "step": 30 - }, - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "show": true - }, - { - "format": "short", - "label": "", - "logBase": 1, - "show": false - } - ] - } - ], - "showTitle": false, - "title": "Dashboard Row", - 
"titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "deployment_namespace", - "options": [], - "query": "label_values(kube_deployment_metadata_generation, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Deployment", - "multi": false, - "name": "deployment_name", - "options": [], - "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "deployment", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Deployment", - "version": 1 - } - etcd-dashboard.json: |+ - { - "__inputs": [ - { - "name": "prometheus", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.5.2" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { 
- "list": [] - }, - "description": "etcd sample Grafana dashboard with Prometheus", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 28, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(etcd_server_has_leader)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "metric": "etcd_server_has_leader", - "refId": "A", - "step": 20 - } - ], - "thresholds": "", - "title": "Up", - "type": "singlestat", - "valueFontSize": "200%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 23, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false 
- }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 5, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Rate", - "metric": "grpc_server_started_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Failed Rate", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 41, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": 
"sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Watch Streams", - "metric": "grpc_server_handled_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Lease Streams", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Active Streams", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": null, - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - 
"seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "etcd_debugging_mvcc_db_total_size_in_bytes", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} DB Size", - "metric": "", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "DB Size", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 1, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}} WAL fsync", - "metric": "etcd_disk_wal_fsync_duration_seconds_bucket", - "refId": "A", - "step": 4 - }, - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": 
"{{instance}} DB fsync", - "metric": "etcd_disk_backend_commit_duration_seconds_bucket", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk Sync Duration", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 29, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Resident Memory", - "metric": "process_resident_memory_bytes", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": 
true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 5, - "id": 22, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Client Traffic In", - "metric": "etcd_network_client_grpc_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 5, - "id": 21, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - 
"links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Client Traffic Out", - "metric": "etcd_network_client_grpc_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Peer Traffic In", - "metric": "etcd_network_peer_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, 
- "title": "Peer Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": null, - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 16, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} Peer Traffic Out", - "metric": "etcd_network_peer_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - 
"repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 40, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Failure Rate", - "metric": "etcd_server_proposals_failed_total", - "refId": "A", - "step": 2 - }, - { - "expr": "sum(etcd_server_proposals_pending)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Pending Total", - "metric": "etcd_server_proposals_pending", - "refId": "B", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_committed_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Commit Rate", - "metric": "etcd_server_proposals_committed_total", - "refId": "C", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_applied_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Apply Rate", - "refId": "D", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Raft Proposals", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - 
"format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": 0, - "editable": false, - "error": false, - "fill": 0, - "id": 19, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "changes(etcd_server_leader_changes_seen_total[1d])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Total Leader Elections Per Day", - "metric": "etcd_server_leader_changes_seen_total", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Total Leader Elections Per Day", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-15m", - "to": 
"now" - }, - "timepicker": { - "now": true, - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "etcd", - "version": 4 - } - kubernetes-capacity-planning-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", - "hide": false, - "intervalFactor": 10, - "legendFormat": "", - "refId": "A", - "step": 50 - } - ], - "title": "Idle CPU", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { 
- "format": "percent", - "label": "cpu usage", - "logBase": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 9, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_load1)", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load5)", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "sum(node_load15)", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "title": "System Load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": 
"prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 4, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", - "intervalFactor": 2, - "legendFormat": "memory usage", - "metric": "memo", - "refId": "A", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_Buffers)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "memo", - "refId": "B", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_Cached)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "memo", - "refId": "C", - "step": 10, - "target": "" - }, - { - "expr": "sum(node_memory_MemFree)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "memo", - "refId": "D", - "step": 10, - "target": "" - } - ], - "title": "Memory Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, 
- { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "intervalFactor": 2, - "metric": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory Usage", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "246px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 6, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": 
false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_disk_bytes_read[5m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "sum(rate(node_disk_bytes_written[5m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 20 - }, - { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 20 - } - ], - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "ms", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 12, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": 
"50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk Space Usage", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 8, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10, - "target": "" - } - ], - "title": "Network Received", - 
"tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 10, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10, - "target": "" - } - ], - "title": "Network Transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "276px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashes": false, - "datasource": "prometheus", - 
"editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 11, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 11, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_info)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current number of Pods", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(kube_node_status_capacity_pods)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Maximum capacity of pods", - "refId": "B", - "step": 10 - } - ], - "title": "Cluster Pod Utilization", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - 
"nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Pod Utilization", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Capacity Planning", - "version": 4 - } - kubernetes-cluster-health-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "editable": false, - "height": "254px", - "panels": [ - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 
40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 1, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Control Plane Components Down", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "Everything UP and healthy", - "value": "null" - }, - { - "op": "=", - "text": "", - "value": "" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - 
"prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Alerts Firing", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "3, 5", - "title": "Alerts Pending", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": 
"=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 4, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Crashlooping Pods", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 5, - "links": 
[], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Node Not Ready", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 6, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", - 
"format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Node Disk Pressure", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Node Memory Pressure", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - 
"minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 8, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(kube_node_spec_unschedulable)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Nodes Unschedulable", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Cluster Health", - "version": 9 - } - kubernetes-cluster-status-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "rows": 
[ - { - "collapse": false, - "editable": false, - "height": "129px", - "panels": [ - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 6, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Control Plane UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "UP", - "value": "null" - } - ], - "valueName": "total" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - 
"prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 6, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "3, 5", - "title": "Alerts Firing", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": true, - "title": "Cluster Health", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "168px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 1, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "API Servers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": 
[ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "Controller Managers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - 
"postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "Schedulers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 4, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "1, 3", - "title": "Crashlooping Control Plane Pods", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" 
- } - ], - "valueName": "current" - } - ], - "showTitle": true, - "title": "Control Plane Status", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "158px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "80, 90", - "title": "CPU Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - 
"name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "80, 90", - "title": "Memory Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "format": 
"time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "80, 90", - "title": "Filesystem Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 10, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "80, 90", - "title": "Pod Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": true, - "title": "Capacity Planning", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" 
- ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Cluster Status", - "version": 3 - } - kubernetes-control-plane-status-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "rows": [ - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 1, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "API Servers UP", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - 
"colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "Controller Managers UP", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - 
"postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "thresholds": "50, 80", - "title": "Schedulers UP", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 4, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 600 - } - ], - "thresholds": "5, 10", - "title": "API Server 
Request Error Rate", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 7, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 30 - } - ], - "title": "API Server Request Latency", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - 
"editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 5, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 60 - } - ], - "title": "End to End Scheduling Latency", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "dtdurations", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 6, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": 
false, - "targets": [ - { - "expr": "sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Error Rate", - "refId": "A", - "step": 60 - }, - { - "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Request Rate", - "refId": "B", - "step": 60 - } - ], - "title": "API Server Request Rates", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Control Plane Status", - "version": 3 - } - kubernetes-resource-requests-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "editable": false, - "height": "300px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "description": "This represents the total [CPU 
resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable CPU Cores", - "refId": "A", - "step": 20 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Requested CPU Cores", - "refId": "B", - "step": 20 - } - ], - "title": "CPU Cores", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "CPU Cores", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": 
"percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "CPU Cores", - "transparent": false, - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "CPU Cores", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "300px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - 
"isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Allocatable Memory", - "refId": "A", - "step": 20 - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "Requested Memory", - "refId": "B", - "step": 20 - } - ], - "title": "Memory", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": "Memory", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 4, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ 
- { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 240 - } - ], - "thresholds": "80, 90", - "title": "Memory", - "transparent": false, - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Memory", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Resource Requests", - "version": 2 - } - nodes-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "description": "Dashboard to get an overview of one server", - "editable": false, - "gnetId": 22, - "graphTooltip": 0, - "hideControls": false, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 
200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", - "hide": false, - "intervalFactor": 10, - "legendFormat": "{{cpu}}", - "refId": "A", - "step": 50 - } - ], - "title": "Idle CPU", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": "cpu usage", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 9, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - 
"targets": [ - { - "expr": "node_load1{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 1m", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": "node_load5{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 5m", - "refId": "B", - "step": 20, - "target": "" - }, - { - "expr": "node_load15{instance=\"$server\"}", - "intervalFactor": 4, - "legendFormat": "load 15m", - "refId": "C", - "step": 20, - "target": "" - } - ], - "title": "System Load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 4, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node_memory_MemTotal{instance=\"$server\"} - 
node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory used", - "metric": "", - "refId": "C", - "step": 10 - }, - { - "expr": "node_memory_Buffers{instance=\"$server\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "metric": "", - "refId": "E", - "step": 10 - }, - { - "expr": "node_memory_Cached{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory cached", - "metric": "", - "refId": "F", - "step": 10 - }, - { - "expr": "node_memory_MemFree{instance=\"$server\"}", - "intervalFactor": 2, - "legendFormat": "memory free", - "metric": "", - "refId": "D", - "step": 10 - } - ], - "title": "Memory Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": 
"rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "80, 90", - "title": "Memory Usage", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 6, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "{instance=\"172.17.0.1:9100\"}", - "yaxis": 2 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", - "hide": false, - "intervalFactor": 4, - "legendFormat": "read", - "refId": "A", - "step": 20, - "target": "" - }, - { - "expr": 
"sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "written", - "refId": "B", - "step": 20 - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", - "intervalFactor": 4, - "legendFormat": "io time", - "refId": "C", - "step": 20 - } - ], - "title": "Disk I/O", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "ms", - "logBase": 1, - "show": true - } - ] - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "editable": false, - "format": "percentunit", - "gauge": { - "maxValue": 1, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "hideTimeOverride": false, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", - "intervalFactor": 2, - "refId": "A", - "step": 60, - "target": "" - } - ], - "thresholds": "0.75, 0.9", - "title": "Disk Space 
Usage", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 8, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A", - "step": 10, - "target": "" - } - ], - "title": "Network Received", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", 
- "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 10, - "isNew": false, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "transmitted", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "B", - "step": 10, - "target": "" - } - ], - "title": "Network Transmitted", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "server", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - 
"15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Nodes", - "version": 2 - } - pods-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 1, - "hideControls": false, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": false, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Current: {{ container_name }}", - "metric": "container_memory_usage_bytes", - "refId": "A", - "step": 15 - }, - { - "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_resource_requests_memory_bytes", - "refId": "B", - 
"step": 20 - }, - { - "expr": "kube_pod_container_resource_limits_memory_bytes{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "metric": "kube_pod_container_resource_limits_memory_bytes", - "refId": "C", - "step": 20 - } - ], - "title": "Memory Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 2, - "isNew": false, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container_name)(rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", - "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A", - "step": 30 - }, - { - "expr": "kube_pod_container_resource_requests_cpu_cores{pod=\"$pod\", container=~\"$container\"}", - "interval": 
"10s", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "metric": "kube_pod_container_resource_requests_cpu_cores", - "refId": "B", - "step": 20 - }, - { - "expr": "kube_pod_container_resource_limits_cpu_cores{pod=\"$pod\", container=~\"$container\"}", - "interval": "10s", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "metric": "kube_pod_container_resource_limits_memory_bytes", - "refId": "C", - "step": 20 - } - ], - "title": "CPU Usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "isNew": false, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "refId": "A", - "step": 30 - } - ], 
- "title": "Network I/O", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "New Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": true, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - "options": [], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", 
- "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Pods", - "version": 1 - } - statefulset-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "editable": false, - "graphTooltip": 1, - "hideControls": false, - "links": [], - "rows": [ - { - "collapse": false, - "editable": false, - "height": "200px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 8, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "CPU", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], 
- "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 9, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "80%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}) / 1024^3", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Memory", - "type": "singlestat", - "valueFontSize": "110%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "Bps", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 7, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - 
"show": true - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "editable": false, - "height": "100px", - "panels": [ - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": false - }, - "id": 5, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "metric": "kube_statefulset_replicas", - "refId": "A", - "step": 600 - } - ], - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - 
], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 6, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Available Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 3, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - 
"from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_statefulset_status_observed_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 2, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "max(kube_statefulset_metadata_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "refId": "A", - "step": 600 - } - ], - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": 
"h6" - }, - { - "collapse": false, - "editable": false, - "height": "350px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 1, - "grid": { - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 1, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B", - "step": 30 - }, - { - "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E", - "step": 30 - } - ], - "title": "Replicas", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "", - "logBase": 1, - "show": true - }, - { - "format": "short", - "label": "", - "logBase": 1, - "show": false - } - ] - } - ], - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": ".*", - "current": {}, - 
"datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "statefulset_namespace", - "options": [], - "query": "label_values(kube_statefulset_metadata_generation, namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "StatefulSet", - "multi": false, - "name": "statefulset_name", - "options": [], - "query": "label_values(kube_statefulset_metadata_generation{namespace=\"$statefulset_namespace\"}, statefulset)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "statefulset", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "StatefulSet", - "version": 1 - } ---- + name: grafana-dashboard-definitions + namespace: monitoring diff --git a/manifests/grafana/grafana-dashboard-sources.yaml b/manifests/grafana/grafana-dashboard-sources.yaml new file mode 100644 index 00000000..61fdcf61 --- /dev/null +++ b/manifests/grafana/grafana-dashboard-sources.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +data: + dashboards.yaml: |- + [ + { + "folder": "", + "name": "0", + "options": { + "path": "/grafana-dashboard-definitions/0" + }, + "org_id": 1, + "type": "file" + } + ] +kind: ConfigMap +metadata: + name: grafana-dashboards + namespace: monitoring diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml deleted file mode 100644 index 772d3f64..00000000 --- a/manifests/grafana/grafana-dashboards.yaml +++ 
/dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboards -data: - dashboards.yaml: |+ - - name: '0' - org_id: 1 - folder: '' - type: file - options: - folder: /grafana-dashboard-definitions/0 diff --git a/manifests/grafana/grafana-datasources.yaml b/manifests/grafana/grafana-datasources.yaml index 33c3081f..5ed25a02 100644 --- a/manifests/grafana/grafana-datasources.yaml +++ b/manifests/grafana/grafana-datasources.yaml @@ -1,15 +1,20 @@ apiVersion: v1 +data: + prometheus.yaml: |- + { + "datasources": [ + { + "access": "proxy", + "etitable": false, + "name": "prometheus", + "org_id": 1, + "type": "prometheus", + "url": "http://prometheus-k8s.monitoring.svc:9090", + "version": 1 + } + ] + } kind: ConfigMap metadata: name: grafana-datasources -data: - prometheus.yaml: |+ - datasources: - - name: prometheus - type: prometheus - access: proxy - org_id: 1 - url: http://prometheus-k8s.monitoring.svc:9090 - version: 1 - editable: false - + namespace: monitoring diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml index 9eb8750f..9d7ae88f 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana/grafana-deployment.yaml @@ -1,48 +1,59 @@ -apiVersion: apps/v1beta1 +apiVersion: apps/v1beta2 kind: Deployment metadata: + labels: + app: grafana name: grafana + namespace: monitoring spec: replicas: 1 + selector: + matchLabels: + app: grafana template: metadata: labels: app: grafana spec: + containers: + - image: quay.io/coreos/monitoring-grafana:5.0.3 + name: grafana + ports: + - containerPort: 3000 + name: http + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - mountPath: /data + name: grafana-storage + readOnly: false + - mountPath: /grafana/conf/provisioning/datasources + name: grafana-datasources + readOnly: false + - mountPath: /grafana/conf/provisioning/dashboards + name: grafana-dashboards + readOnly: false + 
- mountPath: /grafana-dashboard-definitions/0 + name: grafana-dashboard-definitions + readOnly: false securityContext: runAsNonRoot: true runAsUser: 65534 - containers: - - name: grafana - image: quay.io/coreos/monitoring-grafana:5.0.3 - volumeMounts: - - name: grafana-storage - mountPath: /data - - name: grafana-datasources - mountPath: /grafana/conf/provisioning/datasources - - name: grafana-dashboards - mountPath: /grafana/conf/provisioning/dashboards - - name: grafana-dashboard-definitions-0 - mountPath: /grafana-dashboard-definitions/0 - ports: - - name: web - containerPort: 3000 - resources: - requests: - memory: 100Mi - cpu: 100m - limits: - memory: 200Mi - cpu: 200m + serviceAccountName: grafana volumes: - - name: grafana-storage - emptyDir: {} - - name: grafana-datasources - configMap: + - emptyDir: {} + name: grafana-storage + - configMap: name: grafana-datasources - - name: grafana-dashboards - configMap: + name: grafana-datasources + - configMap: name: grafana-dashboards - - name: grafana-dashboard-definitions-0 - configMap: - name: grafana-dashboard-definitions-0 + name: grafana-dashboards + - configMap: + name: grafana-dashboard-definitions + name: grafana-dashboard-definitions diff --git a/manifests/grafana/grafana-service-account.yaml b/manifests/grafana/grafana-service-account.yaml new file mode 100644 index 00000000..3ed3e031 --- /dev/null +++ b/manifests/grafana/grafana-service-account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: grafana + namespace: monitoring diff --git a/manifests/grafana/grafana-service.yaml b/manifests/grafana/grafana-service.yaml index fbcee40d..45f77a0d 100644 --- a/manifests/grafana/grafana-service.yaml +++ b/manifests/grafana/grafana-service.yaml @@ -2,14 +2,11 @@ apiVersion: v1 kind: Service metadata: name: grafana - labels: - app: grafana + namespace: monitoring spec: - type: NodePort ports: - - port: 3000 - protocol: TCP - nodePort: 30902 - targetPort: web + - name: http + port: 3000 + 
targetPort: http selector: app: grafana diff --git a/manifests/k8s/kubeadm/kube-controller-manager.yaml b/manifests/k8s/kubeadm/kube-controller-manager.yaml deleted file mode 100644 index bd8d7cb5..00000000 --- a/manifests/k8s/kubeadm/kube-controller-manager.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - namespace: kube-system - name: kube-controller-manager-prometheus-discovery - labels: - k8s-app: kube-controller-manager -spec: - selector: - component: kube-controller-manager - type: ClusterIP - clusterIP: None - ports: - - name: http-metrics - port: 10252 - targetPort: 10252 - protocol: TCP diff --git a/manifests/k8s/kubeadm/kube-scheduler.yaml b/manifests/k8s/kubeadm/kube-scheduler.yaml deleted file mode 100644 index 2d90097a..00000000 --- a/manifests/k8s/kubeadm/kube-scheduler.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - namespace: kube-system - name: kube-scheduler-prometheus-discovery - labels: - k8s-app: kube-scheduler -spec: - selector: - component: kube-scheduler - type: ClusterIP - clusterIP: None - ports: - - name: http-metrics - port: 10251 - targetPort: 10251 - protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-controller-manager.yaml b/manifests/k8s/self-hosted/kube-controller-manager.yaml deleted file mode 100644 index a2983101..00000000 --- a/manifests/k8s/self-hosted/kube-controller-manager.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - namespace: kube-system - name: kube-controller-manager-prometheus-discovery - labels: - k8s-app: kube-controller-manager -spec: - selector: - k8s-app: kube-controller-manager - type: ClusterIP - clusterIP: None - ports: - - name: http-metrics - port: 10252 - targetPort: 10252 - protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-dns.yaml b/manifests/k8s/self-hosted/kube-dns.yaml deleted file mode 100644 index e0327714..00000000 --- a/manifests/k8s/self-hosted/kube-dns.yaml +++ /dev/null @@ -1,21 +0,0 @@ 
-apiVersion: v1 -kind: Service -metadata: - namespace: kube-system - name: kube-dns-prometheus-discovery - labels: - k8s-app: kube-dns -spec: - selector: - k8s-app: kube-dns - type: ClusterIP - clusterIP: None - ports: - - name: http-metrics-skydns - port: 10055 - targetPort: 10055 - protocol: TCP - - name: http-metrics-dnsmasq - port: 10054 - targetPort: 10054 - protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-scheduler.yaml b/manifests/k8s/self-hosted/kube-scheduler.yaml deleted file mode 100644 index 0fe05dd7..00000000 --- a/manifests/k8s/self-hosted/kube-scheduler.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - namespace: kube-system - name: kube-scheduler-prometheus-discovery - labels: - k8s-app: kube-scheduler -spec: - selector: - k8s-app: kube-scheduler - type: ClusterIP - clusterIP: None - ports: - - name: http-metrics - port: 10251 - targetPort: 10251 - protocol: TCP diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml index 8284fc15..9a8f3111 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml @@ -1,4 +1,4 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: kube-state-metrics diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml index ef5e91ac..cae18483 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml @@ -1,10 +1,13 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: kube-state-metrics rules: -- apiGroups: [""] +- apiGroups: + - "" resources: + - 
configmaps + - secrets - nodes - pods - services @@ -15,31 +18,49 @@ rules: - persistentvolumes - namespaces - endpoints - verbs: ["list", "watch"] -- apiGroups: ["extensions"] + verbs: + - list + - watch +- apiGroups: + - extensions resources: - daemonsets - deployments - replicasets - verbs: ["list", "watch"] -- apiGroups: ["apps"] + verbs: + - list + - watch +- apiGroups: + - apps resources: - statefulsets - verbs: ["list", "watch"] -- apiGroups: ["batch"] + verbs: + - list + - watch +- apiGroups: + - batch resources: - cronjobs - jobs - verbs: ["list", "watch"] -- apiGroups: ["autoscaling"] + verbs: + - list + - watch +- apiGroups: + - autoscaling resources: - horizontalpodautoscalers - verbs: ["list", "watch"] -- apiGroups: ["authentication.k8s.io"] + verbs: + - list + - watch +- apiGroups: + - authentication.k8s.io resources: - tokenreviews - verbs: ["create"] -- apiGroups: ["authorization.k8s.io"] + verbs: + - create +- apiGroups: + - authorization.k8s.io resources: - subjectaccessreviews - verbs: ["create"] \ No newline at end of file + verbs: + - create diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml index 61f918eb..bd6d9475 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -1,80 +1,95 @@ -apiVersion: extensions/v1beta1 +apiVersion: apps/v1beta2 kind: Deployment metadata: + labels: + app: kube-state-metrics name: kube-state-metrics + namespace: monitoring spec: replicas: 1 + selector: + matchLabels: + app: kube-state-metrics template: metadata: labels: app: kube-state-metrics spec: - serviceAccountName: kube-state-metrics + containers: + - args: + - --secure-listen-address=:8443 + - --upstream=http://127.0.0.1:8081/ + image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + name: kube-rbac-proxy-main + ports: + - containerPort: 8443 + name: https-main + resources: + limits: 
+ cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + - args: + - --secure-listen-address=:9443 + - --upstream=http://127.0.0.1:8082/ + image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + name: kube-rbac-proxy-self + ports: + - containerPort: 9443 + name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + - args: + - --host=127.0.0.1 + - --port=8081 + - --telemetry-host=127.0.0.1 + - --telemetry-port=8082 + image: quay.io/coreos/kube-state-metrics:v1.3.0 + name: kube-state-metrics + resources: + limits: + cpu: 102m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi + - command: + - /pod_nanny + - --container=kube-state-metrics + - --cpu=100m + - --extra-cpu=2m + - --memory=150Mi + - --extra-memory=30Mi + - --threshold=5 + - --deployment=kube-state-metrics + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + image: quay.io/coreos/addon-resizer:1.0 + name: addon-resizer + resources: + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi securityContext: runAsNonRoot: true runAsUser: 65534 - containers: - - name: kube-rbac-proxy-main - image: quay.io/brancz/kube-rbac-proxy:v0.2.0 - args: - - "--secure-listen-address=:8443" - - "--upstream=http://127.0.0.1:8081/" - ports: - - name: https-main - containerPort: 8443 - resources: - requests: - memory: 20Mi - cpu: 10m - limits: - memory: 40Mi - cpu: 20m - - name: kube-rbac-proxy-self - image: quay.io/brancz/kube-rbac-proxy:v0.2.0 - args: - - "--secure-listen-address=:9443" - - "--upstream=http://127.0.0.1:8082/" - ports: - - name: https-self - containerPort: 9443 - resources: - requests: - memory: 20Mi - cpu: 10m - limits: - memory: 40Mi - cpu: 20m - - name: kube-state-metrics - image: quay.io/coreos/kube-state-metrics:v1.2.0 - args: - - "--host=127.0.0.1" - - "--port=8081" - - 
"--telemetry-host=127.0.0.1" - - "--telemetry-port=8082" - - name: addon-resizer - image: gcr.io/google_containers/addon-resizer:1.0 - resources: - limits: - cpu: 100m - memory: 30Mi - requests: - cpu: 100m - memory: 30Mi - env: - - name: MY_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: MY_POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - command: - - /pod_nanny - - --container=kube-state-metrics - - --cpu=100m - - --extra-cpu=2m - - --memory=150Mi - - --extra-memory=30Mi - - --threshold=5 - - --deployment=kube-state-metrics + serviceAccountName: kube-state-metrics diff --git a/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml b/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml index a93c3965..dcad8055 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml @@ -1,12 +1,12 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: kube-state-metrics + namespace: monitoring roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: kube-state-metrics-resizer + name: kube-state-metrics-addon-resizer subjects: - kind: ServiceAccount name: kube-state-metrics - diff --git a/manifests/kube-state-metrics/kube-state-metrics-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-role.yaml index 6bf21fb8..0063ffb4 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-role.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-role.yaml @@ -1,15 +1,21 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: kube-state-metrics-resizer + name: kube-state-metrics + namespace: monitoring rules: -- apiGroups: [""] +- apiGroups: + - "" resources: - pods - verbs: ["get"] -- apiGroups: ["extensions"] + verbs: + - get +- apiGroups: + - extensions + resourceNames: + 
- kube-state-metrics resources: - deployments - resourceNames: ["kube-state-metrics"] - verbs: ["get", "update"] - + verbs: + - get + - update diff --git a/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml b/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml index 99779352..fff1028b 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml @@ -2,3 +2,4 @@ apiVersion: v1 kind: ServiceAccount metadata: name: kube-state-metrics + namespace: monitoring diff --git a/manifests/kube-state-metrics/kube-state-metrics-service.yaml b/manifests/kube-state-metrics/kube-state-metrics-service.yaml index b4422685..3e88b562 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-service.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-service.yaml @@ -2,20 +2,16 @@ apiVersion: v1 kind: Service metadata: labels: - app: kube-state-metrics k8s-app: kube-state-metrics name: kube-state-metrics + namespace: monitoring spec: - clusterIP: None ports: - name: https-main port: 8443 targetPort: https-main - protocol: TCP - name: https-self port: 9443 targetPort: https-self - protocol: TCP selector: app: kube-state-metrics - diff --git a/manifests/node-exporter/node-exporter-cluster-role.yaml b/manifests/node-exporter/node-exporter-cluster-role.yaml index 932b7762..ad783ae9 100644 --- a/manifests/node-exporter/node-exporter-cluster-role.yaml +++ b/manifests/node-exporter/node-exporter-cluster-role.yaml @@ -3,11 +3,15 @@ kind: ClusterRole metadata: name: node-exporter rules: -- apiGroups: ["authentication.k8s.io"] +- apiGroups: + - authentication.k8s.io resources: - tokenreviews - verbs: ["create"] -- apiGroups: ["authorization.k8s.io"] + verbs: + - create +- apiGroups: + - authorization.k8s.io resources: - subjectaccessreviews - verbs: ["create"] + verbs: + - create diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml 
b/manifests/node-exporter/node-exporter-daemonset.yaml index f92113e8..1284e93d 100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -1,69 +1,63 @@ -apiVersion: extensions/v1beta1 +apiVersion: apps/v1beta2 kind: DaemonSet metadata: + labels: + app: node-exporter name: node-exporter + namespace: monitoring spec: - updateStrategy: - rollingUpdate: - maxUnavailable: 1 - type: RollingUpdate + selector: + matchLabels: + app: node-exporter template: metadata: labels: app: node-exporter - name: node-exporter spec: - serviceAccountName: node-exporter + containers: + - args: + - --web.listen-address=127.0.0.1:9101 + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + image: quay.io/prometheus/node-exporter:v0.15.2 + name: node-exporter + resources: + limits: + cpu: 102m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi + volumeMounts: + - mountPath: /host/proc + name: proc + readOnly: false + - mountPath: /host/sys + name: sys + readOnly: false + - args: + - --secure-listen-address=:9100 + - --upstream=http://127.0.0.1:9101/ + image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + name: kube-rbac-proxy + ports: + - containerPort: 9100 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi securityContext: runAsNonRoot: true runAsUser: 65534 - hostNetwork: true - hostPID: true - containers: - - image: quay.io/prometheus/node-exporter:v0.15.2 - args: - - "--web.listen-address=127.0.0.1:9101" - - "--path.procfs=/host/proc" - - "--path.sysfs=/host/sys" - name: node-exporter - resources: - requests: - memory: 30Mi - cpu: 100m - limits: - memory: 50Mi - cpu: 200m - volumeMounts: - - name: proc - readOnly: true - mountPath: /host/proc - - name: sys - readOnly: true - mountPath: /host/sys - - name: kube-rbac-proxy - image: quay.io/brancz/kube-rbac-proxy:v0.2.0 - args: - - "--secure-listen-address=:9100" - - "--upstream=http://127.0.0.1:9101/" - ports: - - 
containerPort: 9100 - hostPort: 9100 - name: https - resources: - requests: - memory: 20Mi - cpu: 10m - limits: - memory: 40Mi - cpu: 20m - tolerations: - - effect: NoSchedule - operator: Exists + serviceAccountName: node-exporter volumes: - - name: proc - hostPath: + - hostPath: path: /proc - - name: sys - hostPath: + name: proc + - hostPath: path: /sys - + name: sys diff --git a/manifests/node-exporter/node-exporter-service-account.yaml b/manifests/node-exporter/node-exporter-service-account.yaml index 703a2748..8a03ac16 100644 --- a/manifests/node-exporter/node-exporter-service-account.yaml +++ b/manifests/node-exporter/node-exporter-service-account.yaml @@ -2,3 +2,4 @@ apiVersion: v1 kind: ServiceAccount metadata: name: node-exporter + namespace: monitoring diff --git a/manifests/node-exporter/node-exporter-service.yaml b/manifests/node-exporter/node-exporter-service.yaml index 8aa37747..101a9769 100644 --- a/manifests/node-exporter/node-exporter-service.yaml +++ b/manifests/node-exporter/node-exporter-service.yaml @@ -2,16 +2,13 @@ apiVersion: v1 kind: Service metadata: labels: - app: node-exporter k8s-app: node-exporter name: node-exporter + namespace: monitoring spec: - type: ClusterIP - clusterIP: None ports: - name: https port: 9100 - protocol: TCP + targetPort: https selector: app: node-exporter - diff --git a/manifests/prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml b/manifests/prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml new file mode 100644 index 00000000..554bb6f8 --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s-cluster-role.yaml 
b/manifests/prometheus-k8s/prometheus-k8s-cluster-role.yaml new file mode 100644 index 00000000..d5c45983 --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-cluster-role.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus-k8s +rules: +- apiGroups: + - "" + resources: + - nodes/metrics + verbs: + - get +- nonResourceURLs: + - /metrics + verbs: + - get diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml new file mode 100644 index 00000000..f21ff3c5 --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-k8s-config + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s-config +subjects: +- kind: ServiceAccount + name: prometheus-k8s-config + namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-default.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-binding-default.yaml new file mode 100644 index 00000000..c4039710 --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-role-binding-default.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml new file mode 100644 index 00000000..250c7307 --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: 
prometheus-k8s + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml new file mode 100644 index 00000000..068c77d3 --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-config.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-config.yaml new file mode 100644 index 00000000..5f1cd043 --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-role-config.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-k8s-config + namespace: monitoring +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-default.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-default.yaml new file mode 100644 index 00000000..1c336117 --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-role-default.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-k8s + namespace: default +rules: +- apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-kube-system.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-kube-system.yaml new file mode 100644 index 00000000..d82fe3ab --- /dev/null +++ 
b/manifests/prometheus-k8s/prometheus-k8s-role-kube-system.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-k8s + namespace: kube-system +rules: +- apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-namespace.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-namespace.yaml new file mode 100644 index 00000000..343cfc6d --- /dev/null +++ b/manifests/prometheus-k8s/prometheus-k8s-role-namespace.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-k8s + namespace: monitoring +rules: +- apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus-k8s/prometheus-k8s-rules.yaml similarity index 83% rename from manifests/prometheus/prometheus-k8s-rules.yaml rename to manifests/prometheus-k8s/prometheus-k8s-rules.yaml index c45bbbbc..90cb3f3e 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-rules.yaml @@ -1,12 +1,6 @@ apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-k8s-rules - labels: - role: alert-rules - prometheus: k8s data: - alertmanager.rules.yaml: |+ + alertmanager.rules.yaml: | groups: - name: alertmanager.rules rules: @@ -40,7 +34,7 @@ data: description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. 
summary: Alertmanager's configuration reload failed - etcd3.rules.yaml: |+ + etcd3.rules.yaml: | groups: - name: ./etcd3.rules rules: @@ -164,7 +158,7 @@ data: annotations: description: etcd instance {{ $labels.instance }} commit durations are high summary: high commit durations - general.rules.yaml: |+ + general.rules.yaml: | groups: - name: general.rules rules: @@ -204,7 +198,7 @@ data: description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance will exhaust in file/socket descriptors within the next hour' summary: file descriptors soon exhausted - kube-controller-manager.rules.yaml: |+ + kube-controller-manager.rules.yaml: | groups: - name: kube-controller-manager.rules rules: @@ -218,7 +212,7 @@ data: controllers are not making progress. runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager summary: Controller manager is down - kube-scheduler.rules.yaml: |+ + kube-scheduler.rules.yaml: | groups: - name: kube-scheduler.rules rules: @@ -277,7 +271,7 @@ data: to nodes. 
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler summary: Scheduler is down - kube-state-metrics.rules.yaml: |+ + kube-state-metrics.rules.yaml: | groups: - name: kube-state-metrics.rules rules: @@ -337,7 +331,7 @@ data: description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} times within the last hour summary: Pod is restarting frequently - kubelet.rules.yaml: |+ + kubelet.rules.yaml: | groups: - name: kubelet.rules rules: @@ -386,7 +380,7 @@ data: description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 summary: Kubelet is close to pod limit - kubernetes.rules.yaml: |+ + kubernetes.rules.yaml: | groups: - name: kubernetes.rules rules: @@ -477,7 +471,7 @@ data: description: No API servers are reachable or all have disappeared from service discovery summary: No API servers are reachable - + - alert: K8sCertificateExpirationNotice labels: severity: warning @@ -485,7 +479,7 @@ data: description: Kubernetes API Certificate is expiring soon (less than 7 days) summary: Kubernetes API Certificate is expiering soon expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 - + - alert: K8sCertificateExpirationNotice labels: severity: critical @@ -493,7 +487,7 @@ data: description: Kubernetes API Certificate is expiring in less than 1 day summary: Kubernetes API Certificate is expiering expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 - node.rules.yaml: |+ + node.rules.yaml: | groups: - name: node.rules rules: @@ -541,105 +535,53 @@ data: description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 2 hours - prometheus.rules.yaml: |+ - groups: - - name: prometheus.rules - rules: - - alert: PrometheusConfigReloadFailed - expr: prometheus_config_last_reload_successful == 
0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} - summary: Reloading Promehteus' configuration failed - - - alert: PrometheusNotificationQueueRunningFull - expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity - for: 10m - labels: - severity: warning - annotations: - description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ - $labels.pod}} - summary: Prometheus' alert notification queue is running full - - - alert: PrometheusErrorSendingAlerts - expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) - > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ - $labels.pod}} to Alertmanager {{$labels.Alertmanager}} - summary: Errors while sending alert from Prometheus - - - alert: PrometheusErrorSendingAlerts - expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) - > 0.03 - for: 10m - labels: - severity: critical - annotations: - description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ - $labels.pod}} to Alertmanager {{$labels.Alertmanager}} - summary: Errors while sending alerts from Prometheus - - - alert: PrometheusNotConnectedToAlertmanagers - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 10m - labels: - severity: warning - annotations: - description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected - to any Alertmanagers - summary: Prometheus is not connected to any Alertmanagers - - - alert: PrometheusTSDBReloadsFailing - expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 - for: 12h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value 
| humanize}} - reload failures over the last four hours.' - summary: Prometheus has issues reloading data blocks from disk - - - alert: PrometheusTSDBCompactionsFailing - expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 - for: 12h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} - compaction failures over the last four hours.' - summary: Prometheus has issues compacting sample blocks - - - alert: PrometheusTSDBWALCorruptions - expr: tsdb_wal_corruptions_total > 0 - for: 4h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead - log (WAL).' - summary: Prometheus write-ahead log is corrupted - - - alert: PrometheusNotIngestingSamples - expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0 - for: 10m - labels: - severity: warning - annotations: - description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." 
- summary: "Prometheus isn't ingesting samples" - - - alert: PrometheusTargetScapesDuplicate - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 10m - labels: - severity: warning - annotations: - description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values" - summary: Prometheus has many samples rejected + prometheus.rules.yaml: "groups:\n- name: prometheus.rules\n rules:\n - alert: + PrometheusConfigReloadFailed\n expr: prometheus_config_last_reload_successful + == 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description: + Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}\n + \ summary: Reloading Promehteus' configuration failed\n\n - alert: PrometheusNotificationQueueRunningFull\n + \ expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > + prometheus_notifications_queue_capacity\n for: 10m\n labels:\n severity: + warning\n annotations:\n description: Prometheus' alert notification queue + is running full for {{$labels.namespace}}/{{\n $labels.pod}}\n summary: + Prometheus' alert notification queue is running full \n\n - alert: PrometheusErrorSendingAlerts\n + \ expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])\n + \ > 0.01\n for: 10m\n labels:\n severity: warning\n annotations:\n + \ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{\n + \ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\n summary: + Errors while sending alert from Prometheus\n\n - alert: PrometheusErrorSendingAlerts\n + \ expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])\n + \ > 0.03\n for: 10m\n labels:\n severity: critical\n annotations:\n + \ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{\n + \ $labels.pod}} to Alertmanager 
{{$labels.Alertmanager}}\n summary: + Errors while sending alerts from Prometheus\n\n - alert: PrometheusNotConnectedToAlertmanagers\n + \ expr: prometheus_notifications_alertmanagers_discovered < 1\n for: 10m\n + \ labels:\n severity: warning\n annotations:\n description: Prometheus + {{ $labels.namespace }}/{{ $labels.pod}} is not connected\n to any Alertmanagers\n + \ summary: Prometheus is not connected to any Alertmanagers\n\n - alert: + PrometheusTSDBReloadsFailing\n expr: increase(prometheus_tsdb_reloads_failures_total[2h]) + > 0\n for: 12h\n labels:\n severity: warning\n annotations:\n description: + '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}\n reload + failures over the last four hours.'\n summary: Prometheus has issues reloading + data blocks from disk\n\n - alert: PrometheusTSDBCompactionsFailing\n expr: + increase(prometheus_tsdb_compactions_failed_total[2h]) > 0\n for: 12h\n labels:\n + \ severity: warning\n annotations:\n description: '{{$labels.job}} + at {{$labels.instance}} had {{$value | humanize}}\n compaction failures + over the last four hours.'\n summary: Prometheus has issues compacting sample + blocks\n\n - alert: PrometheusTSDBWALCorruptions\n expr: tsdb_wal_corruptions_total + > 0\n for: 4h\n labels:\n severity: warning\n annotations:\n description: + '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead\n log + (WAL).'\n summary: Prometheus write-ahead log is corrupted\n\n - alert: + PrometheusNotIngestingSamples\n expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) + <= 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description: + \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.\"\n + \ summary: \"Prometheus isn't ingesting samples\"\n\n - alert: PrometheusTargetScapesDuplicate\n + \ expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) + > 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description: + 
\"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate + timestamps but different values\"\n summary: Prometheus has many samples + rejected\n" +kind: ConfigMap +metadata: + name: prometheus-k8s-rules + namespace: monitoring diff --git a/manifests/prometheus/prometheus-k8s-service-account.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-account.yaml similarity index 74% rename from manifests/prometheus/prometheus-k8s-service-account.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-account.yaml index 58d5342d..3e55fad6 100644 --- a/manifests/prometheus/prometheus-k8s-service-account.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-account.yaml @@ -2,3 +2,4 @@ apiVersion: v1 kind: ServiceAccount metadata: name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-alertmanager.yaml similarity index 81% rename from manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-alertmanager.yaml index 19669e3e..e4e75ccc 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-alertmanager.yaml @@ -1,16 +1,17 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: alertmanager labels: k8s-app: alertmanager + name: alertmanager + namespace: monitoring spec: - selector: - matchLabels: - alertmanager: main + endpoints: + - interval: 30s + port: web namespaceSelector: matchNames: - monitoring - endpoints: - - port: web - interval: 30s + selector: + matchLabels: + alertmanager: main diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml similarity index 81% rename from 
manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml index 40361f04..0cffe541 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml @@ -1,23 +1,24 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: kube-apiserver labels: k8s-app: apiserver + name: kube-apiserver + namespace: monitoring spec: - jobLabel: component - selector: - matchLabels: - component: apiserver - provider: kubernetes - namespaceSelector: - matchNames: - - default endpoints: - - port: https + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token interval: 30s + port: https scheme: https tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt serverName: kubernetes - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-coredns.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml similarity index 69% rename from manifests/prometheus/prometheus-k8s-service-monitor-coredns.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml index 362ac899..12a4c5bf 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-coredns.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml @@ -4,16 +4,17 @@ metadata: labels: k8s-app: coredns name: coredns + namespace: monitoring spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: http-metrics jobLabel: k8s-app - selector: - matchLabels: - k8s-app: coredns - component: metrics namespaceSelector: matchNames: - - kube-system - endpoints: - - port: 
http-metrics - interval: 15s - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + - kube-system + selector: + matchLabels: + component: metrics + k8s-app: coredns diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml similarity index 82% rename from manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml index 681c320d..dfb2a25d 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml @@ -1,17 +1,18 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: kube-controller-manager labels: k8s-app: kube-controller-manager + name: kube-controller-manager + namespace: monitoring spec: - jobLabel: k8s-app endpoints: - - port: http-metrics - interval: 30s - selector: - matchLabels: - k8s-app: kube-controller-manager + - interval: 30s + port: http-metrics + jobLabel: k8s-app namespaceSelector: matchNames: - kube-system + selector: + matchLabels: + k8s-app: kube-controller-manager diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml similarity index 81% rename from manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml index 6927f58e..f00db0e4 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml @@ -1,17 +1,18 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: kube-scheduler labels: k8s-app: kube-scheduler + name: 
kube-scheduler + namespace: monitoring spec: - jobLabel: k8s-app endpoints: - - port: http-metrics - interval: 30s - selector: - matchLabels: - k8s-app: kube-scheduler + - interval: 30s + port: http-metrics + jobLabel: k8s-app namespaceSelector: matchNames: - kube-system + selector: + matchLabels: + k8s-app: kube-scheduler diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-state-metrics.yaml similarity index 71% rename from manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-state-metrics.yaml index 1433a5fe..cca52f69 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-state-metrics.yaml @@ -1,28 +1,29 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: kube-state-metrics labels: k8s-app: kube-state-metrics + name: kube-state-metrics + namespace: monitoring spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + port: https-main + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https-self + scheme: https + tlsConfig: + insecureSkipVerify: true jobLabel: k8s-app - selector: - matchLabels: - k8s-app: kube-state-metrics namespaceSelector: matchNames: - monitoring - endpoints: - - port: https-main - scheme: https - interval: 30s - honorLabels: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - insecureSkipVerify: true - - port: https-self - scheme: https - interval: 30s - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - insecureSkipVerify: true + selector: + matchLabels: + k8s-app: kube-state-metrics 
diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml similarity index 71% rename from manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml index 16c9752d..06ec7fc8 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml @@ -1,29 +1,30 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: kubelet labels: k8s-app: kubelet + name: kubelet + namespace: monitoring spec: - jobLabel: k8s-app endpoints: - - port: https-metrics - scheme: https + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token interval: 30s + port: https-metrics + scheme: https tlsConfig: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - - port: https-metrics - scheme: https - path: /metrics/cadvisor - interval: 30s + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true + interval: 30s + path: /metrics/cadvisor + port: https-metrics + scheme: https tlsConfig: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - selector: - matchLabels: - k8s-app: kubelet + jobLabel: k8s-app namespaceSelector: matchNames: - kube-system + selector: + matchLabels: + k8s-app: kubelet diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-node-exporter.yaml similarity index 78% rename from manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-node-exporter.yaml index 0dd72e75..529f2944 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml +++ 
b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-node-exporter.yaml @@ -1,21 +1,22 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: node-exporter labels: k8s-app: node-exporter + name: node-exporter + namespace: monitoring spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https + scheme: https + tlsConfig: + insecureSkipVerify: true jobLabel: k8s-app - selector: - matchLabels: - k8s-app: node-exporter namespaceSelector: matchNames: - monitoring - endpoints: - - port: https - scheme: https - interval: 30s - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - insecureSkipVerify: true + selector: + matchLabels: + k8s-app: node-exporter diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus-operator.yaml similarity index 90% rename from manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus-operator.yaml index 0b8028e7..10e0059a 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus-operator.yaml @@ -1,9 +1,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: prometheus-operator labels: k8s-app: prometheus-operator + name: prometheus-operator + namespace: monitoring spec: endpoints: - port: http diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus.yaml similarity index 81% rename from manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus.yaml index c3d11e57..90b25476 100644 --- 
a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus.yaml @@ -1,16 +1,17 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: prometheus labels: k8s-app: prometheus + name: prometheus + namespace: monitoring spec: - selector: - matchLabels: - prometheus: k8s + endpoints: + - interval: 30s + port: web namespaceSelector: matchNames: - monitoring - endpoints: - - port: web - interval: 30s + selector: + matchLabels: + prometheus: k8s diff --git a/manifests/prometheus/prometheus-k8s-service.yaml b/manifests/prometheus-k8s/prometheus-k8s-service.yaml similarity index 77% rename from manifests/prometheus/prometheus-k8s-service.yaml rename to manifests/prometheus-k8s/prometheus-k8s-service.yaml index 5cd3b65b..85b007f8 100644 --- a/manifests/prometheus/prometheus-k8s-service.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-service.yaml @@ -4,13 +4,12 @@ metadata: labels: prometheus: k8s name: prometheus-k8s + namespace: monitoring spec: - type: NodePort ports: - name: web - nodePort: 30900 port: 9090 - protocol: TCP targetPort: web selector: + app: prometheus prometheus: k8s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus-k8s/prometheus-k8s.yaml similarity index 54% rename from manifests/prometheus/prometheus-k8s.yaml rename to manifests/prometheus-k8s/prometheus-k8s.yaml index 8f243eb0..324d96c7 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s.yaml @@ -1,29 +1,27 @@ apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: - name: k8s labels: prometheus: k8s + name: k8s + namespace: monitoring spec: + alerting: + alertmanagers: + - name: alertmanager-main + namespace: monitoring + port: web replicas: 2 - version: v2.2.1 + resources: + requests: + memory: 400Mi + ruleSelector: + matchLabels: + prometheus: k8s + role: alert-rules serviceAccountName: prometheus-k8s 
serviceMonitorSelector: matchExpressions: - - {key: k8s-app, operator: Exists} - ruleSelector: - matchLabels: - role: alert-rules - prometheus: k8s - resources: - requests: - # 2Gi is default, but won't schedule if you don't have a node with >2Gi - # memory. Modify based on your target and time-series count for - # production use. This value is mainly meant for demonstration/testing - # purposes. - memory: 400Mi - alerting: - alertmanagers: - - namespace: monitoring - name: alertmanager-main - port: web + - key: k8s-app + operator: Exists + version: v2.2.1 diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml index e7e03a29..8f1b4d36 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml @@ -1,4 +1,4 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: prometheus-operator diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml index 1b13c899..9ecd917e 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -1,4 +1,4 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: prometheus-operator @@ -8,13 +8,13 @@ rules: resources: - thirdpartyresources verbs: - - "*" + - '*' - apiGroups: - apiextensions.k8s.io resources: - customresourcedefinitions verbs: - - "*" + - '*' - apiGroups: - monitoring.coreos.com resources: @@ -24,31 +24,45 @@ rules: - alertmanagers/finalizers - servicemonitors verbs: - - "*" + - '*' - apiGroups: - apps resources: - statefulsets - verbs: ["*"] -- apiGroups: [""] + verbs: + - 
'*' +- apiGroups: + - "" resources: - configmaps - secrets - verbs: ["*"] -- apiGroups: [""] + verbs: + - '*' +- apiGroups: + - "" resources: - pods - verbs: ["list", "delete"] -- apiGroups: [""] + verbs: + - list + - delete +- apiGroups: + - "" resources: - services - - endpoints - verbs: ["get", "create", "update"] -- apiGroups: [""] + verbs: + - get + - create + - update +- apiGroups: + - "" resources: - nodes - verbs: ["list", "watch"] -- apiGroups: [""] + verbs: + - list + - watch +- apiGroups: + - "" resources: - namespaces - verbs: ["list"] + verbs: + - list diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator-deployment.yaml similarity index 97% rename from manifests/prometheus-operator/prometheus-operator.yaml rename to manifests/prometheus-operator/prometheus-operator-deployment.yaml index d0030111..1c70ef0a 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator-deployment.yaml @@ -4,6 +4,7 @@ metadata: labels: k8s-app: prometheus-operator name: prometheus-operator + namespace: monitoring spec: replicas: 1 selector: diff --git a/manifests/prometheus-operator/prometheus-operator-service-account.yaml b/manifests/prometheus-operator/prometheus-operator-service-account.yaml index 38d18cce..11898a6f 100644 --- a/manifests/prometheus-operator/prometheus-operator-service-account.yaml +++ b/manifests/prometheus-operator/prometheus-operator-service-account.yaml @@ -2,3 +2,4 @@ apiVersion: v1 kind: ServiceAccount metadata: name: prometheus-operator + namespace: monitoring diff --git a/manifests/prometheus-operator/prometheus-operator-service.yaml b/manifests/prometheus-operator/prometheus-operator-service.yaml index 8882d4a7..8a825387 100644 --- a/manifests/prometheus-operator/prometheus-operator-service.yaml +++ b/manifests/prometheus-operator/prometheus-operator-service.yaml @@ -2,14 +2,11 @@ apiVersion: v1 kind: Service 
metadata: name: prometheus-operator - labels: - k8s-app: prometheus-operator + namespace: monitoring spec: - type: ClusterIP ports: - name: http port: 8080 targetPort: http - protocol: TCP selector: k8s-app: prometheus-operator diff --git a/manifests/prometheus/prometheus-k8s-role-bindings.yaml b/manifests/prometheus/prometheus-k8s-role-bindings.yaml deleted file mode 100644 index 5f190e7a..00000000 --- a/manifests/prometheus/prometheus-k8s-role-bindings.yaml +++ /dev/null @@ -1,54 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: RoleBinding -metadata: - name: prometheus-k8s - namespace: monitoring -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: RoleBinding -metadata: - name: prometheus-k8s - namespace: kube-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: RoleBinding -metadata: - name: prometheus-k8s - namespace: default -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRoleBinding -metadata: - name: prometheus-k8s -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests/prometheus/prometheus-k8s-roles.yaml deleted file mode 100644 index 4f738e77..00000000 --- a/manifests/prometheus/prometheus-k8s-roles.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: Role -metadata: - name: prometheus-k8s - 
namespace: monitoring -rules: -- apiGroups: [""] - resources: - - nodes - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: Role -metadata: - name: prometheus-k8s - namespace: kube-system -rules: -- apiGroups: [""] - resources: - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: Role -metadata: - name: prometheus-k8s - namespace: default -rules: -- apiGroups: [""] - resources: - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRole -metadata: - name: prometheus-k8s -rules: -- apiGroups: [""] - resources: - - nodes/metrics - verbs: ["get"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] diff --git a/manifests/prometheus/prometheus-k8s-service-coredns-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-coredns-metrics.yaml deleted file mode 100644 index cd90a55e..00000000 --- a/manifests/prometheus/prometheus-k8s-service-coredns-metrics.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: coredns-prometheus-discovery - namespace: kube-system - labels: - k8s-app: coredns - component: metrics -spec: - ports: - - name: http-metrics - port: 9153 - protocol: TCP - targetPort: 9153 - selector: - k8s-app: coredns - type: ClusterIP - clusterIP: None diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index fdc2b200..00000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -git+https://github.com/aknuds1/grafanalib.git@v0.4.0 \ No newline at end of file From 3b13afb5624706727d32ef17c0c7e09deff6c9fe Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Fri, 13 Apr 2018 15:10:18 +0200 Subject: [PATCH 230/638] kube-prometheus/thanos: Move to experimental folder In preparation of PR #1206 [1], This patch moves the 
Thanos manifests to the `experimental` folder. [1] https://github.com/coreos/prometheus-operator/pull/1206 --- {manifests => experimental}/thanos/prometheus-self.yaml | 0 {manifests => experimental}/thanos/query.yaml | 0 {manifests => experimental}/thanos/thanos-peers-svc.yaml | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {manifests => experimental}/thanos/prometheus-self.yaml (100%) rename {manifests => experimental}/thanos/query.yaml (100%) rename {manifests => experimental}/thanos/thanos-peers-svc.yaml (100%) diff --git a/manifests/thanos/prometheus-self.yaml b/experimental/thanos/prometheus-self.yaml similarity index 100% rename from manifests/thanos/prometheus-self.yaml rename to experimental/thanos/prometheus-self.yaml diff --git a/manifests/thanos/query.yaml b/experimental/thanos/query.yaml similarity index 100% rename from manifests/thanos/query.yaml rename to experimental/thanos/query.yaml diff --git a/manifests/thanos/thanos-peers-svc.yaml b/experimental/thanos/thanos-peers-svc.yaml similarity index 100% rename from manifests/thanos/thanos-peers-svc.yaml rename to experimental/thanos/thanos-peers-svc.yaml From 73b3259f62531c356426c7bacae0550065a0d93d Mon Sep 17 00:00:00 2001 From: naseemkullah Date: Sun, 15 Apr 2018 02:01:30 -0400 Subject: [PATCH 231/638] Update kube-prometheus-on-kubeadm.md In kubeadm 1.10 (and perhaps earlier versions, not sure), ctrl-mgr and scheduler pods' label key is `component=` rather than `k8s-app=`, which is the selector key used for kube-prometheus exporters for these components. --- docs/kube-prometheus-on-kubeadm.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/kube-prometheus-on-kubeadm.md b/docs/kube-prometheus-on-kubeadm.md index 78a499e0..bef2e3cc 100644 --- a/docs/kube-prometheus-on-kubeadm.md +++ b/docs/kube-prometheus-on-kubeadm.md @@ -42,7 +42,7 @@ schedulerExtraArgs: address: 0.0.0.0 ``` -Notice the `schedulerExtraArgs` and `controllerManagerExtraArgs`. 
This exposes the `kube-controller-manager` and `kube-scheduler` services to the rest of the cluster. +Notice the `schedulerExtraArgs` and `controllerManagerExtraArgs`. This exposes the `kube-controller-manager` and `kube-scheduler` services to the rest of the cluster. If you have kubernetes core components as pods in the kube-system namespace, ensure that the `kube-prometheus-exporter-kube-scheduler` and `kube-prometheus-exporter-kube-controller-manager` services' `spec.selector` values match those of pods. In addition, we will be using `node-exporter` to monitor the `cAdvisor` service on all the nodes. This, however requires a change to the `kubelet` service on the master as well as all the nodes. According to the Kubernetes documentation From e3e510569954be216c4212eff6c2264466a442f8 Mon Sep 17 00:00:00 2001 From: Joakim Karlsson Date: Wed, 18 Apr 2018 11:15:18 +0200 Subject: [PATCH 232/638] added rbac permissions for operator to work properly --- .../prometheus-operator-cluster-role.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml index 9ecd917e..74be0473 100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -66,3 +66,11 @@ rules: - namespaces verbs: - list +- apiGroups: + - "" + resources: + - endpoints + verbs: + - get + - create + - update \ No newline at end of file From 00df72cf5e60271a131d32753c5c198a2a39b271 Mon Sep 17 00:00:00 2001 From: Joakim Karlsson Date: Wed, 18 Apr 2018 11:20:45 +0200 Subject: [PATCH 233/638] cleanup --- .../prometheus-operator-cluster-role.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml index 74be0473..76f943df 
100644 --- a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -49,6 +49,7 @@ rules: - "" resources: - services + - endpoints verbs: - get - create @@ -66,11 +67,3 @@ rules: - namespaces verbs: - list -- apiGroups: - - "" - resources: - - endpoints - verbs: - - get - - create - - update \ No newline at end of file From f6dae8bd711a47e92239a2e5736e7ac328cc8fe9 Mon Sep 17 00:00:00 2001 From: Joakim Karlsson Date: Wed, 18 Apr 2018 12:52:58 +0200 Subject: [PATCH 234/638] updated jsonnet code --- .../prometheus-operator-cluster-role.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet index db8bcd7b..858d7542 100644 --- a/jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet +++ b/jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet @@ -53,6 +53,7 @@ local routingRule = policyRule.new() + policyRule.withApiGroups([""]) + policyRule.withResources([ "services", + "endpoints", ]) + policyRule.withVerbs(["get", "create", "update"]); From 971ed1010e52601030a2fb1b9b91c166e412fec2 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Wed, 18 Apr 2018 12:52:05 +0200 Subject: [PATCH 235/638] kube-prometheus: Move service monitors to target folder With 1d00e0ab ("ServiceMonitor: Enable Prometheus to select ServMon outside own ns") merged: To emphasize the fact, that ServiceMonitors can be deployed alongside their targets, this patch moves the Alertmanager, Kube-State-Metrics, Node-Exporter and Prometheus Operator Service-Monitor into the respective target folder in the kube-prometheus project. 
--- .../alertmanager-main-service-monitor.libsonnet} | 0 jsonnet/alertmanager/alertmanager.libsonnet | 1 + jsonnet/kube-prometheus.libsonnet | 8 ++++---- .../kube-state-metrics-service-monitor.libsonnet} | 0 .../kube-state-metrics/kube-state-metrics.libsonnet | 1 + .../node-exporter-service-monitor.libsonnet} | 0 jsonnet/node-exporter/node-exporter.libsonnet | 1 + .../prometheus-operator-service-monitor.libsonnet} | 0 .../prometheus-operator/prometheus-operator.libsonnet | 11 ++++++----- jsonnet/prometheus/prometheus.libsonnet | 4 ---- .../alertmanager-main-service-monitor.yaml} | 0 .../kube-state-metrics-service-monitor.yaml} | 0 .../node-exporter-service-monitor.yaml} | 0 .../prometheus-operator-service-monitor.yaml} | 0 14 files changed, 13 insertions(+), 13 deletions(-) rename jsonnet/{prometheus/prometheus-k8s-service-monitor-alertmanager.libsonnet => alertmanager/alertmanager-main-service-monitor.libsonnet} (100%) rename jsonnet/{prometheus/prometheus-k8s-service-monitor-kube-state-metrics.libsonnet => kube-state-metrics/kube-state-metrics-service-monitor.libsonnet} (100%) rename jsonnet/{prometheus/prometheus-k8s-service-monitor-node-exporter.libsonnet => node-exporter/node-exporter-service-monitor.libsonnet} (100%) rename jsonnet/{prometheus/prometheus-k8s-service-monitor-prometheus-operator.libsonnet => prometheus-operator/prometheus-operator-service-monitor.libsonnet} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-alertmanager.yaml => alertmanager-main/alertmanager-main-service-monitor.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-kube-state-metrics.yaml => kube-state-metrics/kube-state-metrics-service-monitor.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-node-exporter.yaml => node-exporter/node-exporter-service-monitor.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-prometheus-operator.yaml => 
prometheus-operator/prometheus-operator-service-monitor.yaml} (100%) diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-alertmanager.libsonnet b/jsonnet/alertmanager/alertmanager-main-service-monitor.libsonnet similarity index 100% rename from jsonnet/prometheus/prometheus-k8s-service-monitor-alertmanager.libsonnet rename to jsonnet/alertmanager/alertmanager-main-service-monitor.libsonnet diff --git a/jsonnet/alertmanager/alertmanager.libsonnet b/jsonnet/alertmanager/alertmanager.libsonnet index ef837aba..ec3954c3 100644 --- a/jsonnet/alertmanager/alertmanager.libsonnet +++ b/jsonnet/alertmanager/alertmanager.libsonnet @@ -2,5 +2,6 @@ config:: import "alertmanager-main-secret.libsonnet", serviceAccount:: import "alertmanager-main-service-account.libsonnet", service:: import "alertmanager-main-service.libsonnet", + serviceMonitor:: import "alertmanager-main-service-monitor.libsonnet", alertmanager:: import "alertmanager-main.libsonnet", } diff --git a/jsonnet/kube-prometheus.libsonnet b/jsonnet/kube-prometheus.libsonnet index 44ee06a5..14864056 100644 --- a/jsonnet/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus.libsonnet @@ -35,6 +35,7 @@ local ruleFiles = { "alertmanager-main/alertmanager-main-secret.yaml": alertmanager.config.new(namespace, alertmanagerConfig), "alertmanager-main/alertmanager-main-service-account.yaml": alertmanager.serviceAccount.new(namespace), "alertmanager-main/alertmanager-main-service.yaml": alertmanager.service.new(namespace), + "alertmanager-main/alertmanager-main-service-monitor.yaml": alertmanager.serviceMonitor.new(namespace), "alertmanager-main/alertmanager-main.yaml": alertmanager.alertmanager.new(namespace), "kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml": ksm.clusterRoleBinding.new(namespace), @@ -44,17 +45,20 @@ local ruleFiles = { "kube-state-metrics/kube-state-metrics-role.yaml": ksm.role.new(namespace), "kube-state-metrics/kube-state-metrics-service-account.yaml": 
ksm.serviceAccount.new(namespace), "kube-state-metrics/kube-state-metrics-service.yaml": ksm.service.new(namespace), + "kube-state-metrics/kube-state-metrics-service-monitor.yaml": ksm.serviceMonitor.new(namespace), "node-exporter/node-exporter-cluster-role-binding.yaml": nodeExporter.clusterRoleBinding.new(namespace), "node-exporter/node-exporter-cluster-role.yaml": nodeExporter.clusterRole.new(), "node-exporter/node-exporter-daemonset.yaml": nodeExporter.daemonset.new(namespace), "node-exporter/node-exporter-service-account.yaml": nodeExporter.serviceAccount.new(namespace), "node-exporter/node-exporter-service.yaml": nodeExporter.service.new(namespace), + "node-exporter/node-exporter-service-monitor.yaml": nodeExporter.serviceMonitor.new(namespace), "prometheus-operator/prometheus-operator-cluster-role-binding.yaml": po.clusterRoleBinding.new(namespace), "prometheus-operator/prometheus-operator-cluster-role.yaml": po.clusterRole.new(), "prometheus-operator/prometheus-operator-deployment.yaml": po.deployment.new(namespace), "prometheus-operator/prometheus-operator-service.yaml": po.service.new(namespace), + "prometheus-operator/prometheus-operator-service-monitor.yaml": po.serviceMonitor.new(namespace), "prometheus-operator/prometheus-operator-service-account.yaml": po.serviceAccount.new(namespace), "prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml": prometheus.clusterRoleBinding.new(namespace), @@ -71,15 +75,11 @@ local ruleFiles = { "prometheus-k8s/prometheus-k8s-role-namespace.yaml": prometheus.roleNamespace.new(namespace), "prometheus-k8s/prometheus-k8s-role-kube-system.yaml": prometheus.roleKubeSystem.new(), "prometheus-k8s/prometheus-k8s-role-default.yaml": prometheus.roleDefault.new(), - "prometheus-k8s/prometheus-k8s-service-monitor-alertmanager.yaml": prometheus.serviceMonitorAlertmanager.new(namespace), "prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml": prometheus.serviceMonitorApiserver.new(namespace), 
"prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml": prometheus.serviceMonitorCoreDNS.new(namespace), "prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml": prometheus.serviceMonitorControllerManager.new(namespace), "prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml": prometheus.serviceMonitorScheduler.new(namespace), - "prometheus-k8s/prometheus-k8s-service-monitor-kube-state-metrics.yaml": prometheus.serviceMonitorKubeStateMetrics.new(namespace), "prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml": prometheus.serviceMonitorKubelet.new(namespace), - "prometheus-k8s/prometheus-k8s-service-monitor-node-exporter.yaml": prometheus.serviceMonitorNodeExporter.new(namespace), - "prometheus-k8s/prometheus-k8s-service-monitor-prometheus-operator.yaml": prometheus.serviceMonitorPrometheusOperator.new(namespace), "prometheus-k8s/prometheus-k8s-service-monitor-prometheus.yaml": prometheus.serviceMonitorPrometheus.new(namespace), } } diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-service-monitor.libsonnet similarity index 100% rename from jsonnet/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.libsonnet rename to jsonnet/kube-state-metrics/kube-state-metrics-service-monitor.libsonnet diff --git a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet index 3f9b8ba2..d82765f6 100644 --- a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet @@ -6,4 +6,5 @@ role:: import "kube-state-metrics-role.libsonnet", serviceAccount:: import "kube-state-metrics-service-account.libsonnet", service:: import "kube-state-metrics-service.libsonnet", + serviceMonitor:: import "kube-state-metrics-service-monitor.libsonnet", } diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-node-exporter.libsonnet 
b/jsonnet/node-exporter/node-exporter-service-monitor.libsonnet similarity index 100% rename from jsonnet/prometheus/prometheus-k8s-service-monitor-node-exporter.libsonnet rename to jsonnet/node-exporter/node-exporter-service-monitor.libsonnet diff --git a/jsonnet/node-exporter/node-exporter.libsonnet b/jsonnet/node-exporter/node-exporter.libsonnet index 57e67911..5438f001 100644 --- a/jsonnet/node-exporter/node-exporter.libsonnet +++ b/jsonnet/node-exporter/node-exporter.libsonnet @@ -4,4 +4,5 @@ daemonset:: import "node-exporter-daemonset.libsonnet", serviceAccount:: import "node-exporter-service-account.libsonnet", service:: import "node-exporter-service.libsonnet", + serviceMonitor:: import "node-exporter-service-monitor.libsonnet", } diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus-operator.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-service-monitor.libsonnet similarity index 100% rename from jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus-operator.libsonnet rename to jsonnet/prometheus-operator/prometheus-operator-service-monitor.libsonnet diff --git a/jsonnet/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/prometheus-operator/prometheus-operator.libsonnet index 849acbbf..3659250d 100644 --- a/jsonnet/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/prometheus-operator/prometheus-operator.libsonnet @@ -1,7 +1,8 @@ { - clusterRoleBinding:: import "prometheus-operator-cluster-role-binding.libsonnet", - clusterRole:: import "prometheus-operator-cluster-role.libsonnet", - deployment:: import "prometheus-operator-deployment.libsonnet", - serviceAccount:: import "prometheus-operator-service-account.libsonnet", - service:: import "prometheus-operator-service.libsonnet", + clusterRoleBinding:: import "prometheus-operator-cluster-role-binding.libsonnet", + clusterRole:: import "prometheus-operator-cluster-role.libsonnet", + deployment:: import "prometheus-operator-deployment.libsonnet", + 
serviceAccount:: import "prometheus-operator-service-account.libsonnet", + service:: import "prometheus-operator-service.libsonnet", + serviceMonitor:: import "prometheus-operator-service-monitor.libsonnet", } diff --git a/jsonnet/prometheus/prometheus.libsonnet b/jsonnet/prometheus/prometheus.libsonnet index 99fb9f54..1ba4f55d 100644 --- a/jsonnet/prometheus/prometheus.libsonnet +++ b/jsonnet/prometheus/prometheus.libsonnet @@ -11,15 +11,11 @@ roleDefault:: import "prometheus-k8s-role-default.libsonnet", rules:: import "prometheus-k8s-rules.libsonnet", serviceAccount:: import "prometheus-k8s-service-account.libsonnet", - serviceMonitorAlertmanager:: import "prometheus-k8s-service-monitor-alertmanager.libsonnet", serviceMonitorApiserver:: import "prometheus-k8s-service-monitor-apiserver.libsonnet", serviceMonitorCoreDNS:: import "prometheus-k8s-service-monitor-coredns.libsonnet", serviceMonitorControllerManager:: import "prometheus-k8s-service-monitor-kube-controller-manager.libsonnet", serviceMonitorScheduler:: import "prometheus-k8s-service-monitor-kube-scheduler.libsonnet", - serviceMonitorKubeStateMetrics:: import "prometheus-k8s-service-monitor-kube-state-metrics.libsonnet", serviceMonitorKubelet:: import "prometheus-k8s-service-monitor-kubelet.libsonnet", - serviceMonitorNodeExporter:: import "prometheus-k8s-service-monitor-node-exporter.libsonnet", - serviceMonitorPrometheusOperator:: import "prometheus-k8s-service-monitor-prometheus-operator.libsonnet", serviceMonitorPrometheus:: import "prometheus-k8s-service-monitor-prometheus.libsonnet", service:: import "prometheus-k8s-service.libsonnet", prometheus:: import "prometheus-k8s.libsonnet", diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/alertmanager-main/alertmanager-main-service-monitor.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-alertmanager.yaml rename to 
manifests/alertmanager-main/alertmanager-main-service-monitor.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/kube-state-metrics/kube-state-metrics-service-monitor.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-state-metrics.yaml rename to manifests/kube-state-metrics/kube-state-metrics-service-monitor.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/node-exporter/node-exporter-service-monitor.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-node-exporter.yaml rename to manifests/node-exporter/node-exporter-service-monitor.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator-service-monitor.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus-operator.yaml rename to manifests/prometheus-operator/prometheus-operator-service-monitor.yaml From 7c07940cb8d823d0555bb39c544eb1a22e22e125 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Sun, 22 Apr 2018 10:50:26 +0100 Subject: [PATCH 236/638] kube-prometheus: Fix config reloader RBAC --- .../prometheus-k8s-role-binding-config.libsonnet | 2 +- .../prometheus-k8s-role-binding-default.libsonnet | 2 +- .../prometheus-k8s-role-binding-kube-system.libsonnet | 2 +- .../prometheus-k8s-role-binding-namespace.libsonnet | 2 +- .../prometheus-namespace-role-binding.libsonnet | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet index 2319aa35..631e5fa5 100644 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet @@ -1,5 +1,5 @@ local 
prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; { - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s-config") + new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s-config", "prometheus-k8s") } diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet index f5d38ce7..e88ece99 100644 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet @@ -1,5 +1,5 @@ local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; { - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "default", "prometheus-k8s") + new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "default", "prometheus-k8s", "prometheus-k8s") } diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet index 04c481ca..33967e0a 100644 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet @@ -1,5 +1,5 @@ local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; { - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "kube-system", "prometheus-k8s") + new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "kube-system", "prometheus-k8s", "prometheus-k8s") } diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet index 7833f785..d70ed6ac 100644 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet @@ -1,5 +1,5 @@ local prometheusNamespaceRoleBinding = import 
"prometheus-namespace-role-binding.libsonnet"; { - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s") + new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s", "prometheus-k8s") } diff --git a/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet b/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet index 8b255fa0..a63bcc9c 100644 --- a/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet +++ b/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet @@ -2,12 +2,12 @@ local k = import "ksonnet.beta.3/k.libsonnet"; local roleBinding = k.rbac.v1.roleBinding; { - new(serviceAccountNamespace, namespace, name):: + new(serviceAccountNamespace, namespace, roleName, serviceAccountName):: roleBinding.new() + - roleBinding.mixin.metadata.withName(name) + + roleBinding.mixin.metadata.withName(roleName) + roleBinding.mixin.metadata.withNamespace(namespace) + roleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - roleBinding.mixin.roleRef.withName(name) + + roleBinding.mixin.roleRef.withName(roleName) + roleBinding.mixin.roleRef.mixinInstance({kind: "Role"}) + - roleBinding.withSubjects([{kind: "ServiceAccount", name: name, namespace: serviceAccountNamespace}]) + roleBinding.withSubjects([{kind: "ServiceAccount", name: serviceAccountName, namespace: serviceAccountNamespace}]) } From 0610c45e136483da35eaabe484dcca92d0f7c9ad Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Sun, 22 Apr 2018 10:50:58 +0100 Subject: [PATCH 237/638] kube-prometheus: Fix rule label selector --- jsonnet/prometheus/prometheus-k8s-rules.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/prometheus/prometheus-k8s-rules.libsonnet b/jsonnet/prometheus/prometheus-k8s-rules.libsonnet index abe98fa9..d2014569 100644 --- a/jsonnet/prometheus/prometheus-k8s-rules.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-rules.libsonnet @@ -4,5 +4,6 @@ local configMap = 
k.core.v1.configMap; { new(namespace, ruleFiles):: configMap.new("prometheus-k8s-rules", ruleFiles) + + configMap.mixin.metadata.withLabels({role: "alert-rules", prometheus: "k8s"}) + configMap.mixin.metadata.withNamespace(namespace) } From 2afaeab294b65483a6376b7e9e9e672beca531f9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Sun, 22 Apr 2018 10:51:31 +0100 Subject: [PATCH 238/638] kube-prometheus: Update to latest kubernetes-grafana --- .../grafana-dashboard-definitions.yaml | 1974 +++++++++++++++++ .../prometheus-k8s-role-binding-config.yaml | 2 +- .../prometheus-k8s/prometheus-k8s-rules.yaml | 3 + 3 files changed, 1978 insertions(+), 1 deletion(-) diff --git a/manifests/grafana/grafana-dashboard-definitions.yaml b/manifests/grafana/grafana-dashboard-definitions.yaml index df4c5203..573281af 100644 --- a/manifests/grafana/grafana-dashboard-definitions.yaml +++ b/manifests/grafana/grafana-dashboard-definitions.yaml @@ -3502,6 +3502,714 @@ data: "title": "Kubernetes Cluster Status", "version": 0 } + kubernetes-control-plane-status-dashboard.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + 
"nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "API Servers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "Controller 
Mangers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "Schedulers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + 
"mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(sum by(instance) (rate(apiserver_request_count{code=\u007e\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "API Request Error Rate", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "API Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(instance) (rate(apiserver_request_count{code!\u007e\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Error Rate", + "refId": "A" + }, + { + "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Request Rate", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "API Request Rate", + "tooltip": { + 
"shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "End to End Scheduling Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Control Plane Status", + "version": 0 + } kubernetes-kubelet-dashboard.json: |- { "annotations": { @@ -4053,6 +4761,445 @@ data: "title": "Kubelet", "version": 0 } + kubernetes-resource-requests-dashboard.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + "refId": "A" + }, + { + "expr": 
"max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Cores", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": 
"CPU Cores", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A" + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "cacheTimeout": null, + 
"colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": 
"browser", + "title": "Kubernetes Resource Requests", + "version": 0 + } nodes.json: |- { "annotations": { @@ -5397,6 +6544,833 @@ data: "title": "Pods", "version": 0 } + statefulset-dashboard.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "CPU", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + 
], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}) / 1024^3", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } 
+ ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"max(kube_statefulset_replicas{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 6, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas_current{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Replicas of current version", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + 
"datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_status_observed_generation{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 8, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + 
"from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_metadata_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas specified", + "refId": "A" + }, + { + "expr": "max(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "replicas created", + "refId": "B" + }, + { + "expr": "min(kube_statefulset_status_replicas_ready{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "ready", + "refId": "C" + }, + { + "expr": "min(kube_statefulset_status_replicas_current{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas of current version", + "refId": "D" + }, + { + "expr": "min(kube_statefulset_status_replicas_updated{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "E" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "statefulset_namespace", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation, namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + 
"tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Name", + "multi": false, + "name": "statefulset_name", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation{namespace=\"$statefulset_namespace\"}, statefulset)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "StatefulSets", + "version": 0 + } kind: ConfigMap metadata: name: grafana-dashboard-definitions diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml index f21ff3c5..ec0129db 100644 --- a/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml @@ -9,5 +9,5 @@ roleRef: name: prometheus-k8s-config subjects: - kind: ServiceAccount - name: prometheus-k8s-config + name: prometheus-k8s namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s-rules.yaml b/manifests/prometheus-k8s/prometheus-k8s-rules.yaml index 90cb3f3e..0c03de56 100644 --- a/manifests/prometheus-k8s/prometheus-k8s-rules.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-rules.yaml @@ -583,5 +583,8 @@ data: rejected\n" kind: ConfigMap metadata: + labels: + prometheus: k8s + role: alert-rules name: prometheus-k8s-rules namespace: monitoring From 754ba959918db8e95cb1bebab94e0f0112c5876e Mon Sep 17 00:00:00 2001 From: naseemkullah Date: 
Sun, 22 Apr 2018 22:31:04 -0400 Subject: [PATCH 239/638] Update Monitoring external etcd.md --- docs/Monitoring external etcd.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Monitoring external etcd.md b/docs/Monitoring external etcd.md index edf9af42..f46f4010 100644 --- a/docs/Monitoring external etcd.md +++ b/docs/Monitoring external etcd.md @@ -46,7 +46,7 @@ If your Prometheus Operator is already in place, update it: The below manifest creates a Service to expose etcd metrics (port 2379) -* Replace I`P_OF_YOUR_ETCD_NODE_[0/1/2]` with the IP addresses of your etcd nodes. If you have more than one node, add them to the same list. +* Replace `IP_OF_YOUR_ETCD_NODE_[0/1/2]` with the IP addresses of your etcd nodes. If you have more than one node, add them to the same list. * Use `#insecureSkipVerify: true` or replace `ETCD_DNS_OR_ALTERNAME_NAME` with a valid name for the certificate. In case you have generated the etcd certificated with kube-aws, you will need to use insecureSkipVerify as the valid certificate domain will be different for each etcd node (etcd0, etcd1, etcd2). If you only have one etcd node, you can use the value from `etcd.internalDomainName` speficied in your kube-aws `cluster.yaml` From 8c49a641e2b412985ba9891b8ab768bb5b97e70e Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Sun, 22 Apr 2018 17:17:51 +0100 Subject: [PATCH 240/638] kube-prometheus/hack: Fix cyclic ServiceMonitor dependency Previously the script errored out, as ServiceMonitor objects are only registered once the Prometheus Operator is running. 
--- hack/cluster-monitoring/deploy | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 41e05187..9e1b4881 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -1,10 +1,18 @@ #!/usr/bin/env bash +# exit immediately when a command fails +set -e +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail +# error on unset variables +set -u +# print each command before executing it +set -x manifest_prefix=${1-.} kubectl create namespace monitoring -kubectl apply -f ${manifest_prefix}/manifests/prometheus-operator/ +find ${manifest_prefix}/manifests/prometheus-operator/ -type f ! -name prometheus-operator-service-monitor.yaml -exec kubectl apply -f {} \; # Wait for CRDs to be ready. printf "Waiting for Operator to register custom resource definitions..." @@ -16,9 +24,15 @@ until kubectl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep until kubectl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done echo "done!" +# need to ensure that ServiceMonitors are registered before we can create the prometheus-operator ServiceMonitor +kubectl apply -f ${manifest_prefix}/manifests/prometheus-operator/prometheus-operator-service-monitor.yaml + kubectl apply -f ${manifest_prefix}/manifests/node-exporter/ kubectl apply -f ${manifest_prefix}/manifests/kube-state-metrics/ -kubectl apply -f ${manifest_prefix}/manifests/grafana/ +find ${manifest_prefix}/manifests/grafana/ -type f ! 
-name grafana-dashboard-definitions.yaml -exec kubectl apply -f {} \; + +# kubectl apply wants to put the previous version in an annotation, which is too large, therefore create instead of apply +kubectl create -f ${manifest_prefix}/manifests/grafana/grafana-dashboard-definitions.yaml kubectl apply -f ${manifest_prefix}/manifests/prometheus-k8s/ kubectl apply -f ${manifest_prefix}/manifests/alertmanager-main/ From 45076a6a1f44f800a02e46a735db95751c50dce7 Mon Sep 17 00:00:00 2001 From: Brandon Dimcheff Date: Wed, 25 Apr 2018 21:48:21 -0400 Subject: [PATCH 241/638] kube-prometheus: fix addon-resizer role binding The addon resizer was referring to an incorrect role binding and was unable to access the API. This changes the rolebinding name to refer to the correctly defined rolebinding. --- .../kube-state-metrics-role-binding.libsonnet | 2 +- .../kube-state-metrics/kube-state-metrics-role-binding.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet index bd9b03ae..02a43b7a 100644 --- a/jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet +++ b/jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet @@ -7,7 +7,7 @@ local roleBinding = k.rbac.v1.roleBinding; roleBinding.mixin.metadata.withName("kube-state-metrics") + roleBinding.mixin.metadata.withNamespace(namespace) + roleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - roleBinding.mixin.roleRef.withName("kube-state-metrics-addon-resizer") + + roleBinding.mixin.roleRef.withName("kube-state-metrics") + roleBinding.mixin.roleRef.mixinInstance({kind: "Role"}) + roleBinding.withSubjects([{kind: "ServiceAccount", name: "kube-state-metrics"}]) } diff --git a/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml b/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml index dcad8055..9c61143c 
100644 --- a/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml +++ b/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml @@ -6,7 +6,7 @@ metadata: roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: kube-state-metrics-addon-resizer + name: kube-state-metrics subjects: - kind: ServiceAccount name: kube-state-metrics From 157ad7eaeadfecfe173dc9dabf65778271828f32 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Wed, 25 Apr 2018 17:59:09 +0200 Subject: [PATCH 242/638] kube-prometheus: Update Prometheus Operator version before generating --- .../prometheus-operator-deployment.libsonnet | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet index 0212bc96..35eaa674 100644 --- a/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet +++ b/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet @@ -1,8 +1,6 @@ local k = import "ksonnet.beta.3/k.libsonnet"; -local rawVersion = importstr "../../../../VERSION"; -local removeLineBreaks = function(str) std.join("", std.filter(function(c) c != "\n", std.stringChars(str))); -local version = "v0.18.1";//removeLineBreaks(rawVersion); +local version = "v0.18.1"; local deployment = k.apps.v1beta2.deployment; local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; From 992bcdccc1efc5e346dcac0d1135d4133b3933f6 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Thu, 26 Apr 2018 10:58:58 +0200 Subject: [PATCH 243/638] *: Bump version to v0.19.0 --- .../prometheus-operator-deployment.libsonnet | 2 +- .../prometheus-operator/prometheus-operator-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet index 
35eaa674..f8c9ca2a 100644 --- a/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet +++ b/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet @@ -1,6 +1,6 @@ local k = import "ksonnet.beta.3/k.libsonnet"; -local version = "v0.18.1"; +local version = "v0.19.0"; local deployment = k.apps.v1beta2.deployment; local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; diff --git a/manifests/prometheus-operator/prometheus-operator-deployment.yaml b/manifests/prometheus-operator/prometheus-operator-deployment.yaml index 1c70ef0a..ac744b2a 100644 --- a/manifests/prometheus-operator/prometheus-operator-deployment.yaml +++ b/manifests/prometheus-operator/prometheus-operator-deployment.yaml @@ -19,7 +19,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - image: quay.io/coreos/prometheus-operator:v0.18.1 + image: quay.io/coreos/prometheus-operator:v0.19.0 name: prometheus-operator ports: - containerPort: 8080 From edf21e4382401bb778c8026e1b42e2fd5a77a6f4 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 25 Apr 2018 15:04:20 +0100 Subject: [PATCH 244/638] kube-prometheus/jsonnet: Use jsonnet-bundler --- .gitignore | 1 + Makefile | 8 +- README.md | 351 +- example-dist/base/kube-prometheus.jsonnet | 6 - example-dist/bootkube/.gitignore | 2 - example-dist/bootkube/kube-prometheus.jsonnet | 36 - example-dist/kubeadm/.gitignore | 2 - example-dist/kubeadm/kube-prometheus.jsonnet | 31 - examples/bootkube.jsonnet | 2 + examples/ksonnet-example.jsonnet | 9 + examples/kubeadm.jsonnet | 2 + examples/node-ports.jsonnet | 2 + examples/prometheus-name-override.jsonnet | 9 + hack/cluster-monitoring/deploy | 18 +- hack/scripts/build-jsonnet.sh | 10 +- hack/scripts/kube-prometheus-base.jsonnet | 12 + hack/scripts/kube-prometheus-minikube.jsonnet | 16 + .../alertmanager-main-secret.libsonnet | 8 - ...lertmanager-main-service-account.libsonnet | 8 - 
...lertmanager-main-service-monitor.libsonnet | 32 - .../alertmanager-main-service.libsonnet | 12 - .../alertmanager/alertmanager-main.libsonnet | 19 - jsonnet/alertmanager/alertmanager.libsonnet | 7 - jsonnet/kube-prometheus.libsonnet | 85 - jsonnet/kube-prometheus/.gitignore | 2 + .../alertmanager/alertmanager.libsonnet | 97 + jsonnet/kube-prometheus/jsonnetfile.json | 34 + .../kube-prometheus-bootkube.libsonnet | 23 + .../kube-prometheus-ksonnet.libsonnet | 8 + .../kube-prometheus-kubeadm.libsonnet | 18 + .../kube-prometheus-node-ports.libsonnet | 21 + .../kube-prometheus/kube-prometheus.libsonnet | 26 + .../kube-state-metrics.libsonnet | 286 + .../node-exporter/node-exporter.libsonnet | 167 + .../alertmanager-crd.libsonnet | 1 + .../prometheus-crd.libsonnet | 1 + .../prometheus-operator.libsonnet | 152 + .../servicemonitor-crd.libsonnet | 1 + .../prometheus/prometheus.libsonnet | 454 + ...ate-metrics-cluster-role-binding.libsonnet | 12 - .../kube-state-metrics-cluster-role.libsonnet | 75 - .../kube-state-metrics-deployment.libsonnet | 86 - .../kube-state-metrics-role-binding.libsonnet | 13 - .../kube-state-metrics-role.libsonnet | 28 - ...be-state-metrics-service-account.libsonnet | 8 - ...be-state-metrics-service-monitor.libsonnet | 48 - .../kube-state-metrics-service.libsonnet | 15 - .../kube-state-metrics.libsonnet | 10 - ...de-exporter-cluster-role-binding.libsonnet | 12 - .../node-exporter-cluster-role.libsonnet | 26 - .../node-exporter-daemonset.libsonnet | 58 - .../node-exporter-service-account.libsonnet | 8 - .../node-exporter-service-monitor.libsonnet | 38 - .../node-exporter-service.libsonnet | 14 - jsonnet/node-exporter/node-exporter.libsonnet | 8 - ...us-operator-cluster-role-binding.libsonnet | 12 - ...prometheus-operator-cluster-role.libsonnet | 81 - .../prometheus-operator-deployment.libsonnet | 28 - ...metheus-operator-service-account.libsonnet | 8 - ...metheus-operator-service-monitor.libsonnet | 26 - .../prometheus-operator-service.libsonnet | 
14 - .../prometheus-operator.libsonnet | 8 - ...metheus-k8s-cluster-role-binding.libsonnet | 12 - .../prometheus-k8s-cluster-role.libsonnet | 21 - ...ometheus-k8s-role-binding-config.libsonnet | 5 - ...metheus-k8s-role-binding-default.libsonnet | 5 - ...eus-k8s-role-binding-kube-system.libsonnet | 5 - ...theus-k8s-role-binding-namespace.libsonnet | 5 - .../prometheus-k8s-role-config.libsonnet | 18 - .../prometheus-k8s-role-default.libsonnet | 5 - .../prometheus-k8s-role-kube-system.libsonnet | 5 - .../prometheus-k8s-role-namespace.libsonnet | 5 - .../prometheus/prometheus-k8s-rules.libsonnet | 9 - .../prometheus-k8s-service-account.libsonnet | 8 - ...us-k8s-service-monitor-apiserver.libsonnet | 40 - ...heus-k8s-service-monitor-coredns.libsonnet | 35 - ...-monitor-kube-controller-manager.libsonnet | 33 - ...s-service-monitor-kube-scheduler.libsonnet | 33 - ...heus-k8s-service-monitor-kubelet.libsonnet | 49 - ...s-k8s-service-monitor-prometheus.libsonnet | 32 - .../prometheus-k8s-service.libsonnet | 12 - jsonnet/prometheus/prometheus-k8s.libsonnet | 43 - ...rometheus-namespace-role-binding.libsonnet | 13 - .../prometheus-namespace-role.libsonnet | 21 - jsonnet/prometheus/prometheus.libsonnet | 22 - ...ertmanager-custom-resource-definition.yaml | 2270 ++++ ...prometheus-custom-resource-definition.yaml | 2688 +++++ ...icemonitor-custom-resource-definition.yaml | 236 + ...etheus-operator-cluster-role-binding.yaml} | 0 ...=> 0prometheus-operator-cluster-role.yaml} | 0 ...l => 0prometheus-operator-deployment.yaml} | 2 + ...0prometheus-operator-service-account.yaml} | 0 ...yaml => 0prometheus-operator-service.yaml} | 3 + ...in.yaml => alertmanager-alertmanager.yaml} | 3 + .../alertmanager-main-secret.yaml | 8 - manifests/alertmanager-secret.yaml | 8 + ...yaml => alertmanager-service-account.yaml} | 0 ...yaml => alertmanager-service-monitor.yaml} | 0 ...service.yaml => alertmanager-service.yaml} | 0 ...aml => grafana-dashboard-datasources.yaml} | 0 
.../grafana-dashboard-definitions.yaml | 9679 ++++++++--------- .../grafana-dashboard-sources.yaml | 0 .../{grafana => }/grafana-deployment.yaml | 8 +- .../grafana-service-account.yaml | 0 manifests/{grafana => }/grafana-service.yaml | 0 ...be-state-metrics-cluster-role-binding.yaml | 0 .../kube-state-metrics-cluster-role.yaml | 0 .../kube-state-metrics-deployment.yaml | 2 + .../kube-state-metrics-role-binding.yaml | 0 .../kube-state-metrics-role.yaml | 0 .../kube-state-metrics-service-account.yaml | 0 .../kube-state-metrics-service-monitor.yaml | 0 .../kube-state-metrics-service.yaml | 1 + .../node-exporter-cluster-role-binding.yaml | 0 .../node-exporter-cluster-role.yaml | 0 .../node-exporter-daemonset.yaml | 5 + .../node-exporter-service-account.yaml | 0 .../node-exporter-service-monitor.yaml | 0 .../node-exporter-service.yaml | 1 + ...l => prometheus-cluster-role-binding.yaml} | 0 ...role.yaml => prometheus-cluster-role.yaml} | 0 .../prometheus-k8s/prometheus-k8s-rules.yaml | 590 - ...us-k8s.yaml => prometheus-prometheus.yaml} | 3 + ...ml => prometheus-role-binding-config.yaml} | 0 ...l => prometheus-role-binding-default.yaml} | 0 ... prometheus-role-binding-kube-system.yaml} | 0 ...=> prometheus-role-binding-namespace.yaml} | 0 ...onfig.yaml => prometheus-role-config.yaml} | 0 ...ault.yaml => prometheus-role-default.yaml} | 0 ....yaml => prometheus-role-kube-system.yaml} | 0 ...ce.yaml => prometheus-role-namespace.yaml} | 0 manifests/prometheus-rules.yaml | 166 + ...t.yaml => prometheus-service-account.yaml} | 0 ...prometheus-service-monitor-apiserver.yaml} | 0 ... 
prometheus-service-monitor-core-dns.yaml} | 0 ...vice-monitor-kube-controller-manager.yaml} | 0 ...theus-service-monitor-kube-scheduler.yaml} | 0 ...> prometheus-service-monitor-kubelet.yaml} | 0 ...-service-monitor-prometheus-operator.yaml} | 0 ...rometheus-service-monitor-prometheus.yaml} | 0 ...s-service.yaml => prometheus-service.yaml} | 0 141 files changed, 11505 insertions(+), 7219 deletions(-) delete mode 100644 example-dist/base/kube-prometheus.jsonnet delete mode 100644 example-dist/bootkube/.gitignore delete mode 100644 example-dist/bootkube/kube-prometheus.jsonnet delete mode 100644 example-dist/kubeadm/.gitignore delete mode 100644 example-dist/kubeadm/kube-prometheus.jsonnet create mode 100644 examples/bootkube.jsonnet create mode 100644 examples/ksonnet-example.jsonnet create mode 100644 examples/kubeadm.jsonnet create mode 100644 examples/node-ports.jsonnet create mode 100644 examples/prometheus-name-override.jsonnet create mode 100644 hack/scripts/kube-prometheus-base.jsonnet create mode 100644 hack/scripts/kube-prometheus-minikube.jsonnet delete mode 100644 jsonnet/alertmanager/alertmanager-main-secret.libsonnet delete mode 100644 jsonnet/alertmanager/alertmanager-main-service-account.libsonnet delete mode 100644 jsonnet/alertmanager/alertmanager-main-service-monitor.libsonnet delete mode 100644 jsonnet/alertmanager/alertmanager-main-service.libsonnet delete mode 100644 jsonnet/alertmanager/alertmanager-main.libsonnet delete mode 100644 jsonnet/alertmanager/alertmanager.libsonnet delete mode 100644 jsonnet/kube-prometheus.libsonnet create mode 100644 jsonnet/kube-prometheus/.gitignore create mode 100644 jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet create mode 100644 jsonnet/kube-prometheus/jsonnetfile.json create mode 100644 jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet 
create mode 100644 jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus.libsonnet create mode 100644 jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet create mode 100644 jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet create mode 100644 jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet create mode 100644 jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet create mode 100644 jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet create mode 100644 jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet create mode 100644 jsonnet/kube-prometheus/prometheus/prometheus.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-cluster-role-binding.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-cluster-role.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-deployment.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-role.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-service-account.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-service-monitor.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics-service.libsonnet delete mode 100644 jsonnet/kube-state-metrics/kube-state-metrics.libsonnet delete mode 100644 jsonnet/node-exporter/node-exporter-cluster-role-binding.libsonnet delete mode 100644 jsonnet/node-exporter/node-exporter-cluster-role.libsonnet delete mode 100644 jsonnet/node-exporter/node-exporter-daemonset.libsonnet delete mode 100644 jsonnet/node-exporter/node-exporter-service-account.libsonnet delete mode 100644 jsonnet/node-exporter/node-exporter-service-monitor.libsonnet delete mode 100644 
jsonnet/node-exporter/node-exporter-service.libsonnet delete mode 100644 jsonnet/node-exporter/node-exporter.libsonnet delete mode 100644 jsonnet/prometheus-operator/prometheus-operator-cluster-role-binding.libsonnet delete mode 100644 jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet delete mode 100644 jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet delete mode 100644 jsonnet/prometheus-operator/prometheus-operator-service-account.libsonnet delete mode 100644 jsonnet/prometheus-operator/prometheus-operator-service-monitor.libsonnet delete mode 100644 jsonnet/prometheus-operator/prometheus-operator-service.libsonnet delete mode 100644 jsonnet/prometheus-operator/prometheus-operator.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-cluster-role-binding.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-cluster-role.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-role-config.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-role-default.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-role-kube-system.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-role-namespace.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-rules.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-service-account.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-apiserver.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-coredns.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.libsonnet delete mode 100644 
jsonnet/prometheus/prometheus-k8s-service-monitor-kube-scheduler.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-kubelet.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s-service.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-k8s.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet delete mode 100644 jsonnet/prometheus/prometheus-namespace-role.libsonnet delete mode 100644 jsonnet/prometheus/prometheus.libsonnet create mode 100644 manifests/0prometheus-operator-0alertmanager-custom-resource-definition.yaml create mode 100644 manifests/0prometheus-operator-0prometheus-custom-resource-definition.yaml create mode 100644 manifests/0prometheus-operator-0servicemonitor-custom-resource-definition.yaml rename manifests/{prometheus-operator/prometheus-operator-cluster-role-binding.yaml => 0prometheus-operator-cluster-role-binding.yaml} (100%) rename manifests/{prometheus-operator/prometheus-operator-cluster-role.yaml => 0prometheus-operator-cluster-role.yaml} (100%) rename manifests/{prometheus-operator/prometheus-operator-deployment.yaml => 0prometheus-operator-deployment.yaml} (94%) rename manifests/{prometheus-operator/prometheus-operator-service-account.yaml => 0prometheus-operator-service-account.yaml} (100%) rename manifests/{prometheus-operator/prometheus-operator-service.yaml => 0prometheus-operator-service.yaml} (76%) rename manifests/{alertmanager-main/alertmanager-main.yaml => alertmanager-alertmanager.yaml} (69%) delete mode 100644 manifests/alertmanager-main/alertmanager-main-secret.yaml create mode 100644 manifests/alertmanager-secret.yaml rename manifests/{alertmanager-main/alertmanager-main-service-account.yaml => alertmanager-service-account.yaml} (100%) rename manifests/{alertmanager-main/alertmanager-main-service-monitor.yaml => alertmanager-service-monitor.yaml} (100%) 
rename manifests/{alertmanager-main/alertmanager-main-service.yaml => alertmanager-service.yaml} (100%) rename manifests/{grafana/grafana-datasources.yaml => grafana-dashboard-datasources.yaml} (100%) rename manifests/{grafana => }/grafana-dashboard-definitions.yaml (54%) rename manifests/{grafana => }/grafana-dashboard-sources.yaml (100%) rename manifests/{grafana => }/grafana-deployment.yaml (86%) rename manifests/{grafana => }/grafana-service-account.yaml (100%) rename manifests/{grafana => }/grafana-service.yaml (100%) rename manifests/{kube-state-metrics => }/kube-state-metrics-cluster-role-binding.yaml (100%) rename manifests/{kube-state-metrics => }/kube-state-metrics-cluster-role.yaml (100%) rename manifests/{kube-state-metrics => }/kube-state-metrics-deployment.yaml (97%) rename manifests/{kube-state-metrics => }/kube-state-metrics-role-binding.yaml (100%) rename manifests/{kube-state-metrics => }/kube-state-metrics-role.yaml (100%) rename manifests/{kube-state-metrics => }/kube-state-metrics-service-account.yaml (100%) rename manifests/{kube-state-metrics => }/kube-state-metrics-service-monitor.yaml (100%) rename manifests/{kube-state-metrics => }/kube-state-metrics-service.yaml (94%) rename manifests/{node-exporter => }/node-exporter-cluster-role-binding.yaml (100%) rename manifests/{node-exporter => }/node-exporter-cluster-role.yaml (100%) rename manifests/{node-exporter => }/node-exporter-daemonset.yaml (90%) rename manifests/{node-exporter => }/node-exporter-service-account.yaml (100%) rename manifests/{node-exporter => }/node-exporter-service-monitor.yaml (100%) rename manifests/{node-exporter => }/node-exporter-service.yaml (92%) rename manifests/{prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml => prometheus-cluster-role-binding.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-cluster-role.yaml => prometheus-cluster-role.yaml} (100%) delete mode 100644 manifests/prometheus-k8s/prometheus-k8s-rules.yaml rename 
manifests/{prometheus-k8s/prometheus-k8s.yaml => prometheus-prometheus.yaml} (85%) rename manifests/{prometheus-k8s/prometheus-k8s-role-binding-config.yaml => prometheus-role-binding-config.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-role-binding-default.yaml => prometheus-role-binding-default.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml => prometheus-role-binding-kube-system.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml => prometheus-role-binding-namespace.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-role-config.yaml => prometheus-role-config.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-role-default.yaml => prometheus-role-default.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-role-kube-system.yaml => prometheus-role-kube-system.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-role-namespace.yaml => prometheus-role-namespace.yaml} (100%) create mode 100644 manifests/prometheus-rules.yaml rename manifests/{prometheus-k8s/prometheus-k8s-service-account.yaml => prometheus-service-account.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml => prometheus-service-monitor-apiserver.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml => prometheus-service-monitor-core-dns.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml => prometheus-service-monitor-kube-controller-manager.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml => prometheus-service-monitor-kube-scheduler.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml => prometheus-service-monitor-kubelet.yaml} (100%) rename manifests/{prometheus-operator/prometheus-operator-service-monitor.yaml => 
prometheus-service-monitor-prometheus-operator.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service-monitor-prometheus.yaml => prometheus-service-monitor-prometheus.yaml} (100%) rename manifests/{prometheus-k8s/prometheus-k8s-service.yaml => prometheus-service.yaml} (100%) diff --git a/.gitignore b/.gitignore index 3fec32c8..0887fe6e 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ tmp/ +minikube-manifests/ diff --git a/Makefile b/Makefile index 8ff81356..90736d61 100644 --- a/Makefile +++ b/Makefile @@ -7,5 +7,11 @@ generate: image @echo ">> Compiling assets and generating Kubernetes manifests" docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v `pwd`:/go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make generate-raw +crdtojsonnet: + cat ../../example/prometheus-operator-crd/alertmanager.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet + cat ../../example/prometheus-operator-crd/prometheus.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet + cat ../../example/prometheus-operator-crd/servicemonitor.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet + generate-raw: - ./hack/scripts/build-jsonnet.sh example-dist/base/kube-prometheus.jsonnet manifests + cd jsonnet/kube-prometheus; jb install + ./hack/scripts/build-jsonnet.sh hack/scripts/kube-prometheus-base.jsonnet manifests diff --git a/README.md b/README.md index 7defae27..ac4497d2 100644 --- a/README.md +++ b/README.md @@ -2,221 +2,184 @@ > Note that everything in the `contrib/kube-prometheus/` directory is experimental and may change significantly at any time. 
-This repository collects Kubernetes manifests, [Grafana](http://grafana.com/) dashboards, and -[Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -combined with documentation and scripts to provide single-command deployments of end-to-end -Kubernetes cluster monitoring with [Prometheus](https://prometheus.io/) (Operator). +This repository collects Kubernetes manifests, [Grafana](http://grafana.com/) dashboards, and [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with [Prometheus](https://prometheus.io/) using the Prometheus Operator. + +The content of this project is written in [jsonnet](http://jsonnet.org/). This project could both be described as a package as well as a library. + +Components included in this package: + +* The [Prometheus Operator](https://github.com/coreos/prometheus-operator) +* Highly available [Prometheus](https://prometheus.io/) +* Highly available [Alertmanager](https://github.com/prometheus/alertmanager) +* [Prometheus node-exporter](https://github.com/prometheus/node_exporter) +* [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) +* [Grafana](https://grafana.com/) + +This stack is meant for cluster monitoring, so it is pre-configured to collect metrics from all Kubernetes components. In addition to that it delivers a default set of dashboards and alerting rules. Many of the useful dashboards and alerts come from the [kubernetes-mixin project](https://github.com/kubernetes-monitoring/kubernetes-mixin), similar to this project it provides composable jsonnet as a library for users to customize to their needs. ## Prerequisites -First, you need a running Kubernetes cluster. If you don't have one, we recommend you create one -with [Tectonic Installer](https://coreos.com/tectonic/docs/latest/). 
Despite the name, -Tectonic Installer gives you also the choice to create a barebones Kubernetes cluster, without -CoreOS' Tectonic technology. Otherwise, you can simply make use of -[bootkube](https://github.com/kubernetes-incubator/bootkube) or -[minikube](https://github.com/kubernetes/minikube) for local testing. Some sample contents of this -repository are adapted to work with a [multi-node setup](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node) -using [bootkube](https://github.com/kubernetes-incubator/bootkube). +You will need a Kubernetes cluster, that's it! By default it is assumed, that the kubelet uses token authN and authZ, as otherwise Prometheus needs a client certificate, which gives it full access to the kubelet, rather than just the metrics. Token authN and authZ allows more fine grained and easier access control. +### minikube -> We assume that the kubelet uses token authN and authZ, as otherwise -> Prometheus needs a client certificate, which gives it full access to the -> kubelet, rather than just the metrics. Token authN and authZ allows more fine -> grained and easier access control. Simply start minikube with the following -> command (you can of course adapt the version and memory to your needs): -> -> $ minikube delete && minikube start --kubernetes-version=v1.9.1 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 -> -> In future versions of minikube and kubeadm this will be the default, but for -> the time being, we will have to configure it ourselves. 
+In order to just try out this stack, start minikube with the following command: -## Monitoring Kubernetes +``` +$ minikube delete && minikube start --kubernetes-version=v1.10.1 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +``` -The manifests here use the [Prometheus Operator](https://github.com/coreos/prometheus-operator), -which manages Prometheus servers and their configuration in a cluster. With a single command we can -install +## Quickstart -* The Operator itself -* The Prometheus [node_exporter](https://github.com/prometheus/node_exporter) -* [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) -* The [Prometheus specification](https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheus) based on which the Operator deploys a Prometheus setup -* A Prometheus configuration covering monitoring of all Kubernetes core components and exporters -* A default set of alerting rules on the cluster components' health -* A Grafana instance serving dashboards on cluster metrics -* A three node highly available Alertmanager cluster +Although this project is intended to be used as a library, a compiled version of the Kubernetes manifests generated with this library is checked into this repository in order to try the content out quickly. -Simply run: +Simply create the stack: + +``` +$ kubectl create -f manifests/ +``` + +## Usage + +The content of this project consists of a set of [jsonnet](http://jsonnet.org/) files making up a library to be consumed. 
+ +Install this library in your own project with [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler#install): + +``` +$ mkdir my-kube-prometheus; cd my-kube-prometheus +$ jb init +$ jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet/kube-prometheus +``` + +> `jb` can be installed with `go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb` + +You may wish to not use ksonnet and simply render the generated manifests to files on disk, this can be done with: + +[embedmd]:# (hack/scripts/kube-prometheus-base.jsonnet) +```jsonnet +local kp = (import "kube-prometheus/kube-prometheus.libsonnet") + { + _config+:: { + namespace: "monitoring", + } +}; + +{["0prometheus-operator-"+name+".yaml"]: std.manifestYamlDoc(kp.prometheusOperator[name]) for name in std.objectFields(kp.prometheusOperator)} + +{["node-exporter-"+name+".yaml"]: std.manifestYamlDoc(kp.nodeExporter[name]) for name in std.objectFields(kp.nodeExporter)} + +{["kube-state-metrics-"+name+".yaml"]: std.manifestYamlDoc(kp.kubeStateMetrics[name]) for name in std.objectFields(kp.kubeStateMetrics)} + +{["alertmanager-"+name+".yaml"]: std.manifestYamlDoc(kp.alertmanager[name]) for name in std.objectFields(kp.alertmanager)} + +{["prometheus-"+name+".yaml"]: std.manifestYamlDoc(kp.prometheus[name]) for name in std.objectFields(kp.prometheus)} + +{["grafana-"+name+".yaml"]: std.manifestYamlDoc(kp.grafana[name]) for name in std.objectFields(kp.grafana)} +``` + +This renders all manifests in a json structure of `{filename: manifest-content}`. To split this into files on disk use: + +> Note you need `jsonnet`, `jq`, `sed`, `tr` and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed. 
```bash -cd contrib/kube-prometheus/ -hack/cluster-monitoring/deploy +jsonnet -J vendor example.jsonnet > tmp.json + +files=$(jq -r 'keys[]' tmp.json) + +for file in ${files}; do + # prepare directory + dir=$(dirname "${file}") + path="${dir}" + mkdir -p ${path} + + # convert file name to kebab case (lowercase with dashes) + fullfile=$(echo ${file} | sed -r 's/([a-z0-9])([A-Z])/\1-\L\2/g' | tr '[:upper:]' '[:lower:]') + + # write each value to the path in key; convert multiple times to prettify yaml + jq -r ".[\"${file}\"]" tmp.json | gojsontoyaml -yamltojson | gojsontoyaml > "${fullfile}" +done + +rm tmp.json ``` -After all pods are ready, you can reach each of the UIs by port-forwarding: +## Configuration -* Prometheus UI on node port `kubectl -n monitoring port-forward prometheus-k8s-0 9090` -* Alertmanager UI on node port `kubectl -n monitoring port-forward alertmanager-main-0 9093` -* Grafana on node port `kubectl -n monitoring port-forward $(kubectl get pods -n monitoring -lapp=grafana -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') 3000` - -To tear it all down again, run: - -```bash -hack/cluster-monitoring/teardown -``` - -## Customizing - -As everyone's infrastructure is slightly different, different organizations have different requirements. Thereby there may be modifications you want to do on kube-prometheus to fit your needs. - -The kube-prometheus stack is intended to be a jsonnet library for organizations to consume and use in their own infrastructure repository. Below is an example how it can be used to deploy the stack properly on minikube. - -The three "distribution" examples we have assembled can be found in: - -* `example-dist/base`: contains the plain kube-prometheus stack for organizations to build on. -* `example-dist/kubeadm`: contains the kube-prometheus stack with slight modifications to work properly monitoring kubeadm clusters and exposes UIs on NodePorts for demonstration purposes.
-* `example-dist/bootkube`: contains the kube-prometheus stack with slight modifications to work properly on clusters created with bootkube. - -The examples in `example-dist/` are purely meant for demonstration purposes, the `kube-prometheus.jsonnet` file should live in your organizations infrastructure repository and use the kube-prometheus library provided here. - -Examples of additoinal modifications you may want to make could be adding an `Ingress` object for each of the UIs, but the point of this is that as opposed to other solutions out there, this library does not need to yield all possible customization options, it's all up to the user to customize! - -### minikube kubeadm example - -See `example-dist/kubeadm` for an example for deploying on minikube, using the minikube kubeadm bootstrapper. The `example-dist/kubeadm/kube-prometheus.jsonnet` file renders the kube-prometheus manifests using jsonnet and then merges the result with kubeadm specifics, such as information on how to monitor kube-controller-manager and kube-scheduler as created by kubeadm. In addition for demonstration purposes, it converts the services selecting Prometheus, Alertmanager and Grafana to NodePort services. - -Let's give that a try, and create a minikube cluster: +A hidden `_config` field is located at the top level of the object this library provides. 
These are the available fields with their respective default values: ``` -minikube delete && minikube start --kubernetes-version=v1.9.6 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +{ + _config+:: { + namespace: "default", + + versions+:: { + alertmanager: "v0.14.0", + nodeExporter: "v0.15.2", + kubeStateMetrics: "v1.3.0", + kubeRbacProxy: "v0.3.0", + addonResizer: "1.0", + prometheusOperator: "v0.18.1", + prometheus: "v2.2.1", + }, + + imageRepos+:: { + prometheus: "quay.io/prometheus/prometheus", + alertmanager: "quay.io/prometheus/alertmanager", + kubeStateMetrics: "quay.io/coreos/kube-state-metrics", + kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", + addonResizer: "quay.io/coreos/addon-resizer", + nodeExporter: "quay.io/prometheus/node-exporter", + prometheusOperator: "quay.io/coreos/prometheus-operator", + }, + + prometheus+:: { + replicas: 2, + rules: {}, + }, + + alertmanager+:: { + config: alertmanagerConfig, + replicas: 3, + }, + }, +} ``` -Then we can render the manifests for kubeadm (because we are using the minikube kubeadm bootstrapper): +## Customization -``` -docker run --rm \ - -v `pwd`:/go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ - --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ - po-jsonnet \ - ./hack/scripts/build-jsonnet.sh example-dist/kubeadm/kube-prometheus.jsonnet example-dist/kubeadm/manifests +Jsonnet is a Turing-complete language, so any logic can be expressed in it. It also has powerful merge functionalities, allowing sophisticated customizations of any kind simply by merging them into the object the library provides. + +A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different.
For [kubeadm](https://github.com/kubernetes/kubeadm) and [bootkube](https://github.com/kubernetes-incubator/bootkube) clusters there are mixins available to easily configure these: + +kubeadm: +[embedmd]:# (examples/kubeadm.jsonnet) + +bootkube: +[embedmd]:# (examples/bootkube.jsonnet) + +Another mixin that may be useful for exploring the stack is to expose the UIs of Prometheus, Alertmanager and Grafana on NodePorts: + +[embedmd]:# (examples/node-ports.jsonnet) + +For example the name of the `Prometheus` object provided by this library can be overridden: + +[embedmd]:# (examples/prometheus-name-override.jsonnet) +```jsonnet +((import "kube-prometheus/kube-prometheus.libsonnet") + { + prometheus+: { + prometheus+: { + metadata+: { + name: "my-name", + } + } + } +}).prometheus.prometheus ``` -> Note the `po-jsonnet` docker image is built using [this Dockerfile](/scripts/jsonnet/Dockerfile), you can also build it using `make image` from the `contrib/kube-prometheus` folder. +Standard Kubernetes manifests are all written using [ksonnet-lib](https://github.com/ksonnet/ksonnet-lib/), so they can be modified with the mixins supplied by ksonnet-lib. For example to override the namespace of the node-exporter DaemonSet: -Then the stack can be deployed using +[embedmd]:# (examples/ksonnet-example.jsonnet) +```jsonnet +local k = import "ksonnet/ksonnet.beta.3/k.libsonnet"; +local daemonset = k.apps.v1beta2.daemonSet; +((import "kube-prometheus/kube-prometheus.libsonnet") + { + nodeExporter+: { + daemonset+: + daemonset.mixin.metadata.withNamespace("my-custom-namespace") + + } +}).nodeExporter.daemonset ``` -hack/cluster-monitoring/deploy example-dist/kubeadm -``` - -## Monitoring custom services - -The example manifests in [examples/example-app](/contrib/kube-prometheus/examples/example-app) -deploy a fake service exposing Prometheus metrics.
They additionally define a new Prometheus -server and a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/master/Documentation/design.md#servicemonitor), -which specifies how the example service should be monitored. -The Prometheus Operator will deploy and configure the desired Prometheus instance and continuously -manage its life cycle. - -```bash -hack/example-service-monitoring/deploy -``` - -After all pods are ready you can reach the Prometheus server similar to the Prometheus server above: - -```bash -kubectl port-forward prometheus-frontend-0 9090 -``` - -Then you can access Prometheus through `http://localhost:9090/`. - -Teardown: - -```bash -hack/example-service-monitoring/teardown -``` - -## Dashboarding - -The provided manifests deploy a Grafana instance serving dashboards provided via ConfigMaps. -Said ConfigMaps are generated from Python scripts in assets/grafana, that all have the extension -.dashboard.py as they are loaded by the [grafanalib](https://github.com/aknuds1/grafanalib) -Grafana dashboard generator. Bear in mind that we are for now using a fork of grafanalib as -we needed to make extensive changes to it, in order to be able to generate our dashboards. We are -hoping to be able to consolidate our version with the original. - -As such, in order to make changes to the dashboard bundle, you need to change the \*.dashboard.py -files in assets/grafana, eventually add your own, and then run `make generate` in the -kube-prometheus root directory. - -To read more in depth about developing dashboards, read the -[Developing Prometheus Rules and Grafana Dashboards](docs/developing-alerts-and-dashboards.md) -documentation. - -### Reloading of dashboards - -Currently, Grafana does not support serving dashboards from static files. Instead, the `grafana-watcher` -sidecar container aims to emulate the behavior, by keeping the Grafana database always in sync -with the provided ConfigMap. Hence, the Grafana pod is effectively stateless. 
-This allows managing dashboards via `git` etc. and easily deploying them via CD pipelines. - -In the future, a separate Grafana operator will support gathering dashboards from multiple -ConfigMaps based on label selection. - -WARNING: If you deploy multiple Grafana instances for HA, you must use session affinity. -Otherwise if pods restart the prometheus datasource ID can get out of sync between the pods, -breaking the UI - -## Roadmap - -* Grafana Operator that dynamically discovers and deploys dashboards from ConfigMaps -* KPM/Helm packages to easily provide production-ready cluster-monitoring setup (essentially contents of `hack/cluster-monitoring`) -* Add meta-monitoring to default cluster monitoring setup -* Build out the provided dashboards and alerts for cluster monitoring to have full coverage of all system aspects - -## Monitoring other Cluster Components - -Discovery of API servers and kubelets works the same across all clusters. -Depending on a cluster's setup several other core components, such as etcd or the -scheduler, may be deployed in different ways. -The easiest integration point is for the cluster operator to provide headless services -of all those components to provide a common interface of discovering them. With that -setup they will automatically be discovered by the provided Prometheus configuration. - -For the `kube-scheduler` and `kube-controller-manager` there are headless -services prepared, simply add them to your running cluster: - -```bash -kubectl -n kube-system create -f manifests/k8s/ -``` - -> Hint: if you use this for a cluster not created with bootkube, make sure you -> populate an endpoints object with the address to your `kube-scheduler` and -> `kube-controller-manager`, or adapt the label selectors to match your setup. - -Aside from Kubernetes specific components, etcd is an important part of a -working cluster, but is typically deployed outside of it. 
This monitoring -setup assumes that it is made visible from within the cluster through a headless -service as well. - -> Note that minikube hides some components like etcd so to see the extend of -> this setup we recommend setting up a [local cluster using bootkube](https://github.com/kubernetes-incubator/bootkube/tree/master/hack/multi-node). - -An example for bootkube's multi-node vagrant setup is [here](/contrib/kube-prometheus/manifests/etcd/etcd-bootkube-vagrant-multi.yaml). - -> Hint: this is merely an example for a local setup. The addresses will have to -> be adapted for a setup, that is not a single etcd bootkube created cluster. - -With that setup the headless services provide endpoint lists consumed by -Prometheus to discover the endpoints as targets: - -```bash -$ kubectl get endpoints --all-namespaces -NAMESPACE NAME ENDPOINTS AGE -default kubernetes 172.17.4.101:443 2h -kube-system kube-controller-manager-prometheus-discovery 10.2.30.2:10252 1h -kube-system kube-scheduler-prometheus-discovery 10.2.30.4:10251 1h -monitoring etcd-k8s 172.17.4.51:2379 1h -``` - -## Other Documentation -[Install Docs for a cluster created with KOPS on AWS](docs/KOPSonAWS.md) diff --git a/example-dist/base/kube-prometheus.jsonnet b/example-dist/base/kube-prometheus.jsonnet deleted file mode 100644 index 01760e65..00000000 --- a/example-dist/base/kube-prometheus.jsonnet +++ /dev/null @@ -1,6 +0,0 @@ -local kubePrometheus = import "kube-prometheus.libsonnet"; - -local namespace = "monitoring"; -local objects = kubePrometheus.new(namespace); - -{[path]: std.manifestYamlDoc(objects[path]) for path in std.objectFields(objects)} diff --git a/example-dist/bootkube/.gitignore b/example-dist/bootkube/.gitignore deleted file mode 100644 index 4ea90de6..00000000 --- a/example-dist/bootkube/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -tmp/ -manifests/ diff --git a/example-dist/bootkube/kube-prometheus.jsonnet b/example-dist/bootkube/kube-prometheus.jsonnet deleted file mode 100644 index 
fa731106..00000000 --- a/example-dist/bootkube/kube-prometheus.jsonnet +++ /dev/null @@ -1,36 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local service = k.core.v1.service; -local servicePort = k.core.v1.service.mixin.spec.portsType; -local kubePrometheus = import "kube-prometheus.libsonnet"; - -local namespace = "monitoring"; - -local controllerManagerService = service.new("kube-controller-manager-prometheus-discovery", {"k8s-app": "kube-controller-manager"}, servicePort.newNamed("http-metrics", 10252, 10252)) + - service.mixin.metadata.withNamespace("kube-system") + - service.mixin.metadata.withLabels({"k8s-app": "kube-controller-manager"}); - -local schedulerService = service.new("kube-scheduler-prometheus-discovery", {"k8s-app": "kube-scheduler"}, servicePort.newNamed("http-metrics", 10251, 10251)) + - service.mixin.metadata.withNamespace("kube-system") + - service.mixin.metadata.withLabels({"k8s-app": "kube-scheduler"}); - -local kubeDNSService = service.new("kube-dns-prometheus-discovery", {"k8s-app": "kube-dns"}, [servicePort.newNamed("http-metrics-skydns", 10055, 10055), servicePort.newNamed("http-metrics-dnsmasq", 10054, 10054)]) + - service.mixin.metadata.withNamespace("kube-system") + - service.mixin.metadata.withLabels({"k8s-app": "kube-dns"}); - -local objects = kubePrometheus.new(namespace) + - { - "prometheus-k8s/prometheus-k8s-service.yaml"+: - service.mixin.spec.withPorts(servicePort.newNamed("web", 9090, "web") + servicePort.withNodePort(30900)) + - service.mixin.spec.withType("NodePort"), - "alertmanager-main/alertmanager-main-service.yaml"+: - service.mixin.spec.withPorts(servicePort.newNamed("web", 9093, "web") + servicePort.withNodePort(30903)) + - service.mixin.spec.withType("NodePort"), - "grafana/grafana-service.yaml"+: - service.mixin.spec.withPorts(servicePort.newNamed("http", 3000, "http") + servicePort.withNodePort(30902)) + - service.mixin.spec.withType("NodePort"), - 
"prometheus-k8s/kube-controller-manager-prometheus-discovery-service.yaml": controllerManagerService, - "prometheus-k8s/kube-scheduler-prometheus-discovery-service.yaml": schedulerService, - "prometheus-k8s/kube-dns-prometheus-discovery-service.yaml": kubeDNSService, - }; - -{[path]: std.manifestYamlDoc(objects[path]) for path in std.objectFields(objects)} diff --git a/example-dist/kubeadm/.gitignore b/example-dist/kubeadm/.gitignore deleted file mode 100644 index 4ea90de6..00000000 --- a/example-dist/kubeadm/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -tmp/ -manifests/ diff --git a/example-dist/kubeadm/kube-prometheus.jsonnet b/example-dist/kubeadm/kube-prometheus.jsonnet deleted file mode 100644 index 50ce1020..00000000 --- a/example-dist/kubeadm/kube-prometheus.jsonnet +++ /dev/null @@ -1,31 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local service = k.core.v1.service; -local servicePort = k.core.v1.service.mixin.spec.portsType; -local kubePrometheus = import "kube-prometheus.libsonnet"; - -local namespace = "monitoring"; - -local controllerManagerService = service.new("kube-controller-manager-prometheus-discovery", {component: "kube-controller-manager"}, servicePort.newNamed("http-metrics", 10252, 10252)) + - service.mixin.metadata.withNamespace("kube-system") + - service.mixin.metadata.withLabels({"k8s-app": "kube-controller-manager"}); - -local schedulerService = service.new("kube-scheduler-prometheus-discovery", {component: "kube-scheduler"}, servicePort.newNamed("http-metrics", 10251, 10251)) + - service.mixin.metadata.withNamespace("kube-system") + - service.mixin.metadata.withLabels({"k8s-app": "kube-scheduler"}); - -local objects = kubePrometheus.new(namespace) + - { - "prometheus-k8s/prometheus-k8s-service.yaml"+: - service.mixin.spec.withPorts(servicePort.newNamed("web", 9090, "web") + servicePort.withNodePort(30900)) + - service.mixin.spec.withType("NodePort"), - "alertmanager-main/alertmanager-main-service.yaml"+: - 
service.mixin.spec.withPorts(servicePort.newNamed("web", 9093, "web") + servicePort.withNodePort(30903)) + - service.mixin.spec.withType("NodePort"), - "grafana/grafana-service.yaml"+: - service.mixin.spec.withPorts(servicePort.newNamed("http", 3000, "http") + servicePort.withNodePort(30902)) + - service.mixin.spec.withType("NodePort"), - "prometheus-k8s/kube-controller-manager-prometheus-discovery-service.yaml": controllerManagerService, - "prometheus-k8s/kube-scheduler-prometheus-discovery-service.yaml": schedulerService, - }; - -{[path]: std.manifestYamlDoc(objects[path]) for path in std.objectFields(objects)} diff --git a/examples/bootkube.jsonnet b/examples/bootkube.jsonnet new file mode 100644 index 00000000..89a7eb7b --- /dev/null +++ b/examples/bootkube.jsonnet @@ -0,0 +1,2 @@ +(import "kube-prometheus/kube-prometheus.libsonnet") + +(import "kube-prometheus/kube-prometheus-bootkube.libsonnet") diff --git a/examples/ksonnet-example.jsonnet b/examples/ksonnet-example.jsonnet new file mode 100644 index 00000000..e83ceaf0 --- /dev/null +++ b/examples/ksonnet-example.jsonnet @@ -0,0 +1,9 @@ +local k = import "ksonnet/ksonnet.beta.3/k.libsonnet"; +local daemonset = k.apps.v1beta2.daemonSet; + +((import "kube-prometheus/kube-prometheus.libsonnet") + { + nodeExporter+: { + daemonset+: + daemonset.mixin.metadata.withNamespace("my-custom-namespace") + } +}).nodeExporter.daemonset diff --git a/examples/kubeadm.jsonnet b/examples/kubeadm.jsonnet new file mode 100644 index 00000000..591809eb --- /dev/null +++ b/examples/kubeadm.jsonnet @@ -0,0 +1,2 @@ +(import "kube-prometheus/kube-prometheus.libsonnet") + +(import "kube-prometheus/kube-prometheus-kubeadm.libsonnet") diff --git a/examples/node-ports.jsonnet b/examples/node-ports.jsonnet new file mode 100644 index 00000000..68731676 --- /dev/null +++ b/examples/node-ports.jsonnet @@ -0,0 +1,2 @@ +(import "kube-prometheus/kube-prometheus.libsonnet") + +(import "kube-prometheus/kube-prometheus-node-ports.libsonnet") diff 
--git a/examples/prometheus-name-override.jsonnet b/examples/prometheus-name-override.jsonnet new file mode 100644 index 00000000..d6410fd8 --- /dev/null +++ b/examples/prometheus-name-override.jsonnet @@ -0,0 +1,9 @@ +((import "kube-prometheus/kube-prometheus.libsonnet") + { + prometheus+: { + prometheus+: { + metadata+: { + name: "my-name", + } + } + } +}).prometheus.prometheus diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 9e1b4881..6be94e01 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -8,11 +8,11 @@ set -u # print each command before executing it set -x -manifest_prefix=${1-.} +manifest_prefix=${1-./manifests} kubectl create namespace monitoring -find ${manifest_prefix}/manifests/prometheus-operator/ -type f ! -name prometheus-operator-service-monitor.yaml -exec kubectl apply -f {} \; +find ${manifest_prefix}/prometheus-operator/ -type f ! -name service-monitor.yaml -exec kubectl apply -f {} \; # Wait for CRDs to be ready. printf "Waiting for Operator to register custom resource definitions..." @@ -25,14 +25,14 @@ until kubectl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep echo "done!" # need to ensure that ServiceMonitors are registered before we can create the prometheus-operator ServiceMonitor -kubectl apply -f ${manifest_prefix}/manifests/prometheus-operator/prometheus-operator-service-monitor.yaml +kubectl apply -f ${manifest_prefix}/prometheus-operator/service-monitor.yaml -kubectl apply -f ${manifest_prefix}/manifests/node-exporter/ -kubectl apply -f ${manifest_prefix}/manifests/kube-state-metrics/ -find ${manifest_prefix}/manifests/grafana/ -type f ! -name grafana-dashboard-definitions.yaml -exec kubectl apply -f {} \; +kubectl apply -f ${manifest_prefix}/node-exporter/ +kubectl apply -f ${manifest_prefix}/kube-state-metrics/ +find ${manifest_prefix}/grafana/ -type f ! 
-name dashboard-definitions.yaml -exec kubectl apply -f {} \; # kubectl apply wants to put the previous version in an annotation, which is too large, therefore create instead of apply -kubectl create -f ${manifest_prefix}/manifests/grafana/grafana-dashboard-definitions.yaml -kubectl apply -f ${manifest_prefix}/manifests/prometheus-k8s/ -kubectl apply -f ${manifest_prefix}/manifests/alertmanager-main/ +kubectl create -f ${manifest_prefix}/grafana/dashboard-definitions.yaml +kubectl apply -f ${manifest_prefix}/prometheus/ +kubectl apply -f ${manifest_prefix}/alertmanager/ diff --git a/hack/scripts/build-jsonnet.sh b/hack/scripts/build-jsonnet.sh index 7189962f..7a754e6c 100755 --- a/hack/scripts/build-jsonnet.sh +++ b/hack/scripts/build-jsonnet.sh @@ -8,12 +8,7 @@ json="tmp/manifests.json" rm -rf ${prefix} mkdir -p $(dirname "${json}") -jsonnet \ - -J $GOPATH/src/github.com/ksonnet/ksonnet-lib \ - -J $GOPATH/src/github.com/grafana/grafonnet-lib \ - -J $GOPATH/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet \ - -J $GOPATH/src/github.com/brancz/kubernetes-grafana/src/kubernetes-jsonnet \ - ${jsonnet} > ${json} +jsonnet -J jsonnet/kube-prometheus/vendor -J jsonnet ${jsonnet} > ${json} files=$(jq -r 'keys[]' ${json}) @@ -21,5 +16,6 @@ for file in ${files}; do dir=$(dirname "${file}") path="${prefix}/${dir}" mkdir -p ${path} - jq -r ".[\"${file}\"]" ${json} | gojsontoyaml -yamltojson | gojsontoyaml > "${prefix}/${file}" + fullfile=$(echo ${file} | sed -r 's/([a-z0-9])([A-Z])/\1-\L\2/g' | tr '[:upper:]' '[:lower:]') + jq -r ".[\"${file}\"]" ${json} | gojsontoyaml -yamltojson | gojsontoyaml > "${prefix}/${fullfile}" done diff --git a/hack/scripts/kube-prometheus-base.jsonnet b/hack/scripts/kube-prometheus-base.jsonnet new file mode 100644 index 00000000..84eb3c29 --- /dev/null +++ b/hack/scripts/kube-prometheus-base.jsonnet @@ -0,0 +1,12 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 
'monitoring', + }, +}; + +{ ['0prometheus-operator-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheusOperator[name]) for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name + '.yaml']: std.manifestYamlDoc(kp.nodeExporter[name]) for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name + '.yaml']: std.manifestYamlDoc(kp.kubeStateMetrics[name]) for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name + '.yaml']: std.manifestYamlDoc(kp.alertmanager[name]) for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheus[name]) for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name + '.yaml']: std.manifestYamlDoc(kp.grafana[name]) for name in std.objectFields(kp.grafana) } diff --git a/hack/scripts/kube-prometheus-minikube.jsonnet b/hack/scripts/kube-prometheus-minikube.jsonnet new file mode 100644 index 00000000..9a6fff17 --- /dev/null +++ b/hack/scripts/kube-prometheus-minikube.jsonnet @@ -0,0 +1,16 @@ +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + + { + _config+:: { + namespace: 'monitoring', + }, + }; + +{ ['0prometheus-operator-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheusOperator[name]) for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name + '.yaml']: std.manifestYamlDoc(kp.nodeExporter[name]) for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name + '.yaml']: std.manifestYamlDoc(kp.kubeStateMetrics[name]) for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name + '.yaml']: std.manifestYamlDoc(kp.alertmanager[name]) for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheus[name]) for name in 
std.objectFields(kp.prometheus) } + +{ ['grafana-' + name + '.yaml']: std.manifestYamlDoc(kp.grafana[name]) for name in std.objectFields(kp.grafana) } diff --git a/jsonnet/alertmanager/alertmanager-main-secret.libsonnet b/jsonnet/alertmanager/alertmanager-main-secret.libsonnet deleted file mode 100644 index a8f9011b..00000000 --- a/jsonnet/alertmanager/alertmanager-main-secret.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local secret = k.core.v1.secret; - -{ - new(namespace, plainConfig):: - secret.new("alertmanager-main", {"alertmanager.yaml": std.base64(plainConfig)}) + - secret.mixin.metadata.withNamespace(namespace) -} diff --git a/jsonnet/alertmanager/alertmanager-main-service-account.libsonnet b/jsonnet/alertmanager/alertmanager-main-service-account.libsonnet deleted file mode 100644 index 89ca2f80..00000000 --- a/jsonnet/alertmanager/alertmanager-main-service-account.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local serviceAccount = k.core.v1.serviceAccount; - -{ - new(namespace):: - serviceAccount.new("alertmanager-main") + - serviceAccount.mixin.metadata.withNamespace(namespace) -} diff --git a/jsonnet/alertmanager/alertmanager-main-service-monitor.libsonnet b/jsonnet/alertmanager/alertmanager-main-service-monitor.libsonnet deleted file mode 100644 index 5f13a2b4..00000000 --- a/jsonnet/alertmanager/alertmanager-main-service-monitor.libsonnet +++ /dev/null @@ -1,32 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "alertmanager", - "namespace": namespace, - "labels": { - "k8s-app": "alertmanager" - } - }, - "spec": { - "selector": { - "matchLabels": { - "alertmanager": "main" - } - }, - "namespaceSelector": { - "matchNames": [ - "monitoring" - ] - }, - "endpoints": [ - { - "port": "web", - "interval": "30s" - } - ] - } - } -} diff --git 
a/jsonnet/alertmanager/alertmanager-main-service.libsonnet b/jsonnet/alertmanager/alertmanager-main-service.libsonnet deleted file mode 100644 index e89f009f..00000000 --- a/jsonnet/alertmanager/alertmanager-main-service.libsonnet +++ /dev/null @@ -1,12 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local service = k.core.v1.service; -local servicePort = k.core.v1.service.mixin.spec.portsType; - -local alertmanagerPort = servicePort.newNamed("web", 9093, "web"); - -{ - new(namespace):: - service.new("alertmanager-main", {app: "alertmanager", alertmanager: "main"}, alertmanagerPort) + - service.mixin.metadata.withNamespace(namespace) + - service.mixin.metadata.withLabels({alertmanager: "main"}) -} diff --git a/jsonnet/alertmanager/alertmanager-main.libsonnet b/jsonnet/alertmanager/alertmanager-main.libsonnet deleted file mode 100644 index 63e06a16..00000000 --- a/jsonnet/alertmanager/alertmanager-main.libsonnet +++ /dev/null @@ -1,19 +0,0 @@ -{ - new(namespace):: - { - apiVersion: "monitoring.coreos.com/v1", - kind: "Alertmanager", - metadata: { - name: "main", - namespace: namespace, - labels: { - alertmanager: "main", - }, - }, - spec: { - replicas: 3, - version: "v0.14.0", - serviceAccountName: "alertmanager-main", - }, - } -} diff --git a/jsonnet/alertmanager/alertmanager.libsonnet b/jsonnet/alertmanager/alertmanager.libsonnet deleted file mode 100644 index ec3954c3..00000000 --- a/jsonnet/alertmanager/alertmanager.libsonnet +++ /dev/null @@ -1,7 +0,0 @@ -{ - config:: import "alertmanager-main-secret.libsonnet", - serviceAccount:: import "alertmanager-main-service-account.libsonnet", - service:: import "alertmanager-main-service.libsonnet", - serviceMonitor:: import "alertmanager-main-service-monitor.libsonnet", - alertmanager:: import "alertmanager-main.libsonnet", -} diff --git a/jsonnet/kube-prometheus.libsonnet b/jsonnet/kube-prometheus.libsonnet deleted file mode 100644 index 14864056..00000000 --- a/jsonnet/kube-prometheus.libsonnet +++ /dev/null 
@@ -1,85 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; - -local alertmanager = import "alertmanager/alertmanager.libsonnet"; -local ksm = import "kube-state-metrics/kube-state-metrics.libsonnet"; -local nodeExporter = import "node-exporter/node-exporter.libsonnet"; -local po = import "prometheus-operator/prometheus-operator.libsonnet"; -local prometheus = import "prometheus/prometheus.libsonnet"; -local grafana = import "grafana/grafana.libsonnet"; - -local alertmanagerConfig = importstr "../assets/alertmanager/alertmanager.yaml"; - -local ruleFiles = { - "alertmanager.rules.yaml": importstr "../assets/prometheus/rules/alertmanager.rules.yaml", - "etcd3.rules.yaml": importstr "../assets/prometheus/rules/etcd3.rules.yaml", - "general.rules.yaml": importstr "../assets/prometheus/rules/general.rules.yaml", - "kube-controller-manager.rules.yaml": importstr "../assets/prometheus/rules/kube-controller-manager.rules.yaml", - "kube-scheduler.rules.yaml": importstr "../assets/prometheus/rules/kube-scheduler.rules.yaml", - "kube-state-metrics.rules.yaml": importstr "../assets/prometheus/rules/kube-state-metrics.rules.yaml", - "kubelet.rules.yaml": importstr "../assets/prometheus/rules/kubelet.rules.yaml", - "kubernetes.rules.yaml": importstr "../assets/prometheus/rules/kubernetes.rules.yaml", - "node.rules.yaml": importstr "../assets/prometheus/rules/node.rules.yaml", - "prometheus.rules.yaml": importstr "../assets/prometheus/rules/prometheus.rules.yaml", -}; - -{ - new(namespace):: - { - "grafana/grafana-dashboard-definitions.yaml": grafana.dashboardDefinitions.new(namespace), - "grafana/grafana-dashboard-sources.yaml": grafana.dashboardSources.new(namespace), - "grafana/grafana-datasources.yaml": grafana.dashboardDatasources.new(namespace), - "grafana/grafana-deployment.yaml": grafana.deployment.new(namespace), - "grafana/grafana-service-account.yaml": grafana.serviceAccount.new(namespace), - "grafana/grafana-service.yaml": grafana.service.new(namespace), - - 
"alertmanager-main/alertmanager-main-secret.yaml": alertmanager.config.new(namespace, alertmanagerConfig), - "alertmanager-main/alertmanager-main-service-account.yaml": alertmanager.serviceAccount.new(namespace), - "alertmanager-main/alertmanager-main-service.yaml": alertmanager.service.new(namespace), - "alertmanager-main/alertmanager-main-service-monitor.yaml": alertmanager.serviceMonitor.new(namespace), - "alertmanager-main/alertmanager-main.yaml": alertmanager.alertmanager.new(namespace), - - "kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml": ksm.clusterRoleBinding.new(namespace), - "kube-state-metrics/kube-state-metrics-cluster-role.yaml": ksm.clusterRole.new(), - "kube-state-metrics/kube-state-metrics-deployment.yaml": ksm.deployment.new(namespace), - "kube-state-metrics/kube-state-metrics-role-binding.yaml": ksm.roleBinding.new(namespace), - "kube-state-metrics/kube-state-metrics-role.yaml": ksm.role.new(namespace), - "kube-state-metrics/kube-state-metrics-service-account.yaml": ksm.serviceAccount.new(namespace), - "kube-state-metrics/kube-state-metrics-service.yaml": ksm.service.new(namespace), - "kube-state-metrics/kube-state-metrics-service-monitor.yaml": ksm.serviceMonitor.new(namespace), - - "node-exporter/node-exporter-cluster-role-binding.yaml": nodeExporter.clusterRoleBinding.new(namespace), - "node-exporter/node-exporter-cluster-role.yaml": nodeExporter.clusterRole.new(), - "node-exporter/node-exporter-daemonset.yaml": nodeExporter.daemonset.new(namespace), - "node-exporter/node-exporter-service-account.yaml": nodeExporter.serviceAccount.new(namespace), - "node-exporter/node-exporter-service.yaml": nodeExporter.service.new(namespace), - "node-exporter/node-exporter-service-monitor.yaml": nodeExporter.serviceMonitor.new(namespace), - - "prometheus-operator/prometheus-operator-cluster-role-binding.yaml": po.clusterRoleBinding.new(namespace), - "prometheus-operator/prometheus-operator-cluster-role.yaml": po.clusterRole.new(), - 
"prometheus-operator/prometheus-operator-deployment.yaml": po.deployment.new(namespace), - "prometheus-operator/prometheus-operator-service.yaml": po.service.new(namespace), - "prometheus-operator/prometheus-operator-service-monitor.yaml": po.serviceMonitor.new(namespace), - "prometheus-operator/prometheus-operator-service-account.yaml": po.serviceAccount.new(namespace), - - "prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml": prometheus.clusterRoleBinding.new(namespace), - "prometheus-k8s/prometheus-k8s-cluster-role.yaml": prometheus.clusterRole.new(), - "prometheus-k8s/prometheus-k8s-service-account.yaml": prometheus.serviceAccount.new(namespace), - "prometheus-k8s/prometheus-k8s-service.yaml": prometheus.service.new(namespace), - "prometheus-k8s/prometheus-k8s.yaml": prometheus.prometheus.new(namespace), - "prometheus-k8s/prometheus-k8s-rules.yaml": prometheus.rules.new(namespace, ruleFiles), - "prometheus-k8s/prometheus-k8s-role-binding-config.yaml": prometheus.roleBindingConfig.new(namespace), - "prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml": prometheus.roleBindingNamespace.new(namespace), - "prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml": prometheus.roleBindingKubeSystem.new(namespace), - "prometheus-k8s/prometheus-k8s-role-binding-default.yaml": prometheus.roleBindingDefault.new(namespace), - "prometheus-k8s/prometheus-k8s-role-config.yaml": prometheus.roleConfig.new(namespace), - "prometheus-k8s/prometheus-k8s-role-namespace.yaml": prometheus.roleNamespace.new(namespace), - "prometheus-k8s/prometheus-k8s-role-kube-system.yaml": prometheus.roleKubeSystem.new(), - "prometheus-k8s/prometheus-k8s-role-default.yaml": prometheus.roleDefault.new(), - "prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml": prometheus.serviceMonitorApiserver.new(namespace), - "prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml": prometheus.serviceMonitorCoreDNS.new(namespace), - 
"prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml": prometheus.serviceMonitorControllerManager.new(namespace), - "prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml": prometheus.serviceMonitorScheduler.new(namespace), - "prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml": prometheus.serviceMonitorKubelet.new(namespace), - "prometheus-k8s/prometheus-k8s-service-monitor-prometheus.yaml": prometheus.serviceMonitorPrometheus.new(namespace), - } -} diff --git a/jsonnet/kube-prometheus/.gitignore b/jsonnet/kube-prometheus/.gitignore new file mode 100644 index 00000000..52a75ecb --- /dev/null +++ b/jsonnet/kube-prometheus/.gitignore @@ -0,0 +1,2 @@ +jsonnetfile.lock.json +vendor/ diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet new file mode 100644 index 00000000..f4634703 --- /dev/null +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -0,0 +1,97 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'null'\n routes:\n - match:\n alertname: DeadMansSwitch\n receiver: 'null'\nreceivers:\n- name: 'null'\n"; + +{ + _config+:: { + namespace: 'default', + + versions+:: { + alertmanager: 'v0.14.0', + }, + + imageRepos+:: { + alertmanager: 'quay.io/prometheus/alertmanager', + }, + + alertmanager+:: { + config: alertmanagerConfig, + replicas: 3, + }, + }, + + alertmanager+:: { + secret: + local secret = k.core.v1.secret; + + secret.new('alertmanager-main', { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) + + secret.mixin.metadata.withNamespace($._config.namespace), + + serviceAccount: + local serviceAccount = k.core.v1.serviceAccount; + + serviceAccount.new('alertmanager-main') + + 
serviceAccount.mixin.metadata.withNamespace($._config.namespace), + + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + + local alertmanagerPort = servicePort.newNamed('web', 9093, 'web'); + + service.new('alertmanager-main', { app: 'alertmanager', alertmanager: 'main' }, alertmanagerPort) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ alertmanager: 'main' }), + + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'alertmanager', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'alertmanager', + }, + }, + spec: { + selector: { + matchLabels: { + alertmanager: 'main', + }, + }, + namespaceSelector: { + matchNames: [ + 'monitoring', + ], + }, + endpoints: [ + { + port: 'web', + interval: '30s', + }, + ], + }, + }, + + alertmanager: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'Alertmanager', + metadata: { + name: 'main', + namespace: $._config.namespace, + labels: { + alertmanager: 'main', + }, + }, + spec: { + replicas: $._config.alertmanager.replicas, + version: $._config.versions.alertmanager, + baseImage: $._config.imageRepos.alertmanager, + nodeSelector: { 'beta.kubernetes.io/os': 'linux' }, + serviceAccountName: 'alertmanager-main', + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json new file mode 100644 index 00000000..f9014406 --- /dev/null +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -0,0 +1,34 @@ +{ + "dependencies": [ + { + "name": "ksonnet", + "source": { + "git": { + "remote": "https://github.com/ksonnet/ksonnet-lib", + "subdir": "" + } + }, + "version": "master" + }, + { + "name": "kubernetes-mixin", + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "subdir": "" + } + }, + "version": "master" + }, + { + "name": "grafana", + "source": { + "git": { + 
"remote": "https://github.com/brancz/kubernetes-grafana", + "subdir": "grafana" + } + }, + "version": "master" + } + ] +} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet new file mode 100644 index 00000000..a9cf3bb3 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet @@ -0,0 +1,23 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+:: { + kubeControllerManagerPrometheusDiscoveryService: + service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) + + service.mixin.spec.withClusterIp('None'), + kubeSchedulerPrometheusDiscoveryService: + service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + + service.mixin.spec.withClusterIp('None'), + kubeDnsPrometheusDiscoveryService: + service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('http-metrics-skydns', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) + + service.mixin.spec.withClusterIp('None'), + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet new file mode 100644 index 00000000..664e1912 --- /dev/null +++ 
b/jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet @@ -0,0 +1,8 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet'); + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet new file mode 100644 index 00000000..a249d1db --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet @@ -0,0 +1,18 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+: { + kubeControllerManagerPrometheusDiscoveryService: + service.new('kube-controller-manager-prometheus-discovery', { component: 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) + + service.mixin.spec.withClusterIp('None'), + kubeSchedulerPrometheusDiscoveryService: + service.new('kube-scheduler-prometheus-discovery', { component: 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + + service.mixin.spec.withClusterIp('None'), + }, +} diff --git 
a/jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet new file mode 100644 index 00000000..48df7478 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet @@ -0,0 +1,21 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+: { + service+: + service.mixin.spec.withPorts(servicePort.newNamed('web', 9090, 'web') + servicePort.withNodePort(30900)) + + service.mixin.spec.withType('NodePort'), + }, + alertmanager+: { + service+: + service.mixin.spec.withPorts(servicePort.newNamed('web', 9093, 'web') + servicePort.withNodePort(30903)) + + service.mixin.spec.withType('NodePort'), + }, + grafana+: { + service+: + service.mixin.spec.withPorts(servicePort.newNamed('http', 3000, 'http') + servicePort.withNodePort(30902)) + + service.mixin.spec.withType('NodePort'), + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet new file mode 100644 index 00000000..36eae76d --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -0,0 +1,26 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +(import 'grafana/grafana.libsonnet') + +(import 'kube-state-metrics/kube-state-metrics.libsonnet') + +(import 'node-exporter/node-exporter.libsonnet') + +(import 'alertmanager/alertmanager.libsonnet') + +(import 'prometheus-operator/prometheus-operator.libsonnet') + +(import 'prometheus/prometheus.libsonnet') + +(import 'kubernetes-mixin/mixin.libsonnet') + +{ + _config+:: { + kubeStateMetricsSelector: 'job="kube-state-metrics"', + cadvisorSelector: 'job="kubelet"', + nodeExporterSelector: 'job="node-exporter"', + kubeletSelector: 'job="kubelet"', + notKubeDnsSelector: 'job!="kube-dns"', + + prometheus+:: { + rules: $.prometheusRules + $.prometheusAlerts, + }, + + grafana+:: { + dashboards: 
$.grafanaDashboards, + }, + }, +} diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet new file mode 100644 index 00000000..c73c16ab --- /dev/null +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -0,0 +1,286 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +{ + _config+:: { + namespace: 'default', + + versions+:: { + kubeStateMetrics: 'v1.3.0', + kubeRbacProxy: 'v0.3.0', + addonResizer: '1.0', + }, + + imageRepos+:: { + kubeStateMetrics: 'quay.io/coreos/kube-state-metrics', + kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy', + addonResizer: 'quay.io/coreos/addon-resizer', + }, + }, + + kubeStateMetrics+:: { + clusterRoleBinding: + local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; + + clusterRoleBinding.new() + + clusterRoleBinding.mixin.metadata.withName('kube-state-metrics') + + clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + clusterRoleBinding.mixin.roleRef.withName('kube-state-metrics') + + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + + clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'kube-state-metrics', namespace: $._config.namespace }]), + + clusterRole: + local clusterRole = k.rbac.v1.clusterRole; + local policyRule = clusterRole.rulesType; + + local coreRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'configmaps', + 'secrets', + 'nodes', + 'pods', + 'services', + 'resourcequotas', + 'replicationcontrollers', + 'limitranges', + 'persistentvolumeclaims', + 'persistentvolumes', + 'namespaces', + 'endpoints', + ]) + + policyRule.withVerbs(['list', 'watch']); + + local extensionsRule = policyRule.new() + + policyRule.withApiGroups(['extensions']) + + policyRule.withResources([ + 'daemonsets', + 'deployments', + 'replicasets', + ]) + + policyRule.withVerbs(['list', 'watch']); + + local 
appsRule = policyRule.new() + + policyRule.withApiGroups(['apps']) + + policyRule.withResources([ + 'statefulsets', + ]) + + policyRule.withVerbs(['list', 'watch']); + + local batchRule = policyRule.new() + + policyRule.withApiGroups(['batch']) + + policyRule.withResources([ + 'cronjobs', + 'jobs', + ]) + + policyRule.withVerbs(['list', 'watch']); + + local autoscalingRule = policyRule.new() + + policyRule.withApiGroups(['autoscaling']) + + policyRule.withResources([ + 'horizontalpodautoscalers', + ]) + + policyRule.withVerbs(['list', 'watch']); + + local authenticationRole = policyRule.new() + + policyRule.withApiGroups(['authentication.k8s.io']) + + policyRule.withResources([ + 'tokenreviews', + ]) + + policyRule.withVerbs(['create']); + + local authorizationRole = policyRule.new() + + policyRule.withApiGroups(['authorization.k8s.io']) + + policyRule.withResources([ + 'subjectaccessreviews', + ]) + + policyRule.withVerbs(['create']); + + local rules = [coreRule, extensionsRule, appsRule, batchRule, autoscalingRule, authenticationRole, authorizationRole]; + + clusterRole.new() + + clusterRole.mixin.metadata.withName('kube-state-metrics') + + clusterRole.withRules(rules), + deployment: + local deployment = k.apps.v1beta2.deployment; + local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; + local volume = k.apps.v1beta2.deployment.mixin.spec.template.spec.volumesType; + local containerPort = container.portsType; + local containerVolumeMount = container.volumeMountsType; + local podSelector = deployment.mixin.spec.template.spec.selectorType; + + local podLabels = { app: 'kube-state-metrics' }; + + local proxyClusterMetrics = + container.new('kube-rbac-proxy-main', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + + container.withArgs([ + '--secure-listen-address=:8443', + '--upstream=http://127.0.0.1:8081/', + ]) + + container.withPorts(containerPort.newNamed('https-main', 8443)) + + 
container.mixin.resources.withRequests({ cpu: '10m', memory: '20Mi' }) + + container.mixin.resources.withLimits({ cpu: '20m', memory: '40Mi' }); + + local proxySelfMetrics = + container.new('kube-rbac-proxy-self', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + + container.withArgs([ + '--secure-listen-address=:9443', + '--upstream=http://127.0.0.1:8082/', + ]) + + container.withPorts(containerPort.newNamed('https-self', 9443)) + + container.mixin.resources.withRequests({ cpu: '10m', memory: '20Mi' }) + + container.mixin.resources.withLimits({ cpu: '20m', memory: '40Mi' }); + + local kubeStateMetrics = + container.new('kube-state-metrics', $._config.imageRepos.kubeStateMetrics + ':' + $._config.versions.kubeStateMetrics) + + container.withArgs([ + '--host=127.0.0.1', + '--port=8081', + '--telemetry-host=127.0.0.1', + '--telemetry-port=8082', + ]) + + container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + + container.mixin.resources.withLimits({ cpu: '102m', memory: '180Mi' }); + + local addonResizer = + container.new('addon-resizer', $._config.imageRepos.addonResizer + ':' + $._config.versions.addonResizer) + + container.withCommand([ + '/pod_nanny', + '--container=kube-state-metrics', + '--cpu=100m', + '--extra-cpu=2m', + '--memory=150Mi', + '--extra-memory=30Mi', + '--threshold=5', + '--deployment=kube-state-metrics', + ]) + + container.withEnv([ + { + name: 'MY_POD_NAME', + valueFrom: { + fieldRef: { apiVersion: 'v1', fieldPath: 'metadata.name' }, + }, + }, + { + name: 'MY_POD_NAMESPACE', + valueFrom: { + fieldRef: { apiVersion: 'v1', fieldPath: 'metadata.namespace' }, + }, + }, + ]) + + container.mixin.resources.withRequests({ cpu: '10m', memory: '30Mi' }) + + container.mixin.resources.withLimits({ cpu: '10m', memory: '30Mi' }); + + local c = [proxyClusterMetrics, proxySelfMetrics, kubeStateMetrics, addonResizer]; + + deployment.new('kube-state-metrics', 1, c, podLabels) + + 
deployment.mixin.metadata.withNamespace($._config.namespace) + + deployment.mixin.metadata.withLabels(podLabels) + + deployment.mixin.spec.selector.withMatchLabels(podLabels) + + deployment.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + + deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + + deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + + deployment.mixin.spec.template.spec.withServiceAccountName('kube-state-metrics'), + + roleBinding: + local roleBinding = k.rbac.v1.roleBinding; + + roleBinding.new() + + roleBinding.mixin.metadata.withName('kube-state-metrics') + + roleBinding.mixin.metadata.withNamespace($._config.namespace) + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withName('kube-state-metrics') + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'kube-state-metrics' }]), + + role: + local role = k.rbac.v1.role; + local policyRule = role.rulesType; + + local coreRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'pods', + ]) + + policyRule.withVerbs(['get']); + + local extensionsRule = policyRule.new() + + policyRule.withApiGroups(['extensions']) + + policyRule.withResources([ + 'deployments', + ]) + + policyRule.withVerbs(['get', 'update']) + + policyRule.withResourceNames(['kube-state-metrics']); + + local rules = [coreRule, extensionsRule]; + + role.new() + + role.mixin.metadata.withName('kube-state-metrics') + + role.mixin.metadata.withNamespace($._config.namespace) + + role.withRules(rules), + + serviceAccount: + local serviceAccount = k.core.v1.serviceAccount; + + serviceAccount.new('kube-state-metrics') + + serviceAccount.mixin.metadata.withNamespace($._config.namespace), + + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + + local 
ksmServicePortMain = servicePort.newNamed('https-main', 8443, 'https-main'); + local ksmServicePortSelf = servicePort.newNamed('https-self', 9443, 'https-self'); + + service.new('kube-state-metrics', $.kubeStateMetrics.deployment.spec.selector.matchLabels, [ksmServicePortMain, ksmServicePortSelf]) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-state-metrics' }) + + service.mixin.spec.withClusterIp('None'), + + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-state-metrics', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'kube-state-metrics', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'kube-state-metrics', + }, + }, + namespaceSelector: { + matchNames: [ + 'monitoring', + ], + }, + endpoints: [ + { + port: 'https-main', + scheme: 'https', + interval: '30s', + honorLabels: true, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + { + port: 'https-self', + scheme: 'https', + interval: '30s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + ], + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet new file mode 100644 index 00000000..d232d920 --- /dev/null +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -0,0 +1,167 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +{ + _config+:: { + namespace: 'default', + + versions+:: { + nodeExporter: 'v0.15.2', + kubeRbacProxy: 'v0.3.0', + }, + + imageRepos+:: { + nodeExporter: 'quay.io/prometheus/node-exporter', + kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy', + }, + }, + + nodeExporter+:: { + clusterRoleBinding: + local clusterRoleBinding = 
k.rbac.v1.clusterRoleBinding; + + clusterRoleBinding.new() + + clusterRoleBinding.mixin.metadata.withName('node-exporter') + + clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + clusterRoleBinding.mixin.roleRef.withName('node-exporter') + + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + + clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'node-exporter', namespace: $._config.namespace }]), + + clusterRole: + local clusterRole = k.rbac.v1.clusterRole; + local policyRule = clusterRole.rulesType; + + local authenticationRole = policyRule.new() + + policyRule.withApiGroups(['authentication.k8s.io']) + + policyRule.withResources([ + 'tokenreviews', + ]) + + policyRule.withVerbs(['create']); + + local authorizationRole = policyRule.new() + + policyRule.withApiGroups(['authorization.k8s.io']) + + policyRule.withResources([ + 'subjectaccessreviews', + ]) + + policyRule.withVerbs(['create']); + + local rules = [authenticationRole, authorizationRole]; + + clusterRole.new() + + clusterRole.mixin.metadata.withName('node-exporter') + + clusterRole.withRules(rules), + + daemonset: + local daemonset = k.apps.v1beta2.daemonSet; + local container = daemonset.mixin.spec.template.spec.containersType; + local volume = daemonset.mixin.spec.template.spec.volumesType; + local containerPort = container.portsType; + local containerVolumeMount = container.volumeMountsType; + local podSelector = daemonset.mixin.spec.template.spec.selectorType; + local toleration = daemonset.mixin.spec.template.spec.tolerationsType; + + local podLabels = { app: 'node-exporter' }; + + local masterToleration = toleration.new() + + toleration.withEffect('NoSchedule') + + toleration.withKey('node-role.kubernetes.io/master'); + + local procVolumeName = 'proc'; + local procVolume = volume.fromHostPath(procVolumeName, '/proc'); + local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc'); + + local sysVolumeName = 'sys'; + 
local sysVolume = volume.fromHostPath(sysVolumeName, '/sys'); + local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys'); + + local nodeExporter = + container.new('node-exporter', $._config.imageRepos.nodeExporter + ':' + $._config.versions.nodeExporter) + + container.withArgs([ + '--web.listen-address=127.0.0.1:9101', + '--path.procfs=/host/proc', + '--path.sysfs=/host/sys', + ]) + + container.withVolumeMounts([procVolumeMount, sysVolumeMount]) + + container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + + container.mixin.resources.withLimits({ cpu: '102m', memory: '180Mi' }); + + local proxy = + container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + + container.withArgs([ + '--secure-listen-address=:9100', + '--upstream=http://127.0.0.1:9101/', + ]) + + container.withPorts(containerPort.newNamed('https', 9100)) + + container.mixin.resources.withRequests({ cpu: '10m', memory: '20Mi' }) + + container.mixin.resources.withLimits({ cpu: '20m', memory: '40Mi' }); + + local c = [nodeExporter, proxy]; + + daemonset.new() + + daemonset.mixin.metadata.withName('node-exporter') + + daemonset.mixin.metadata.withNamespace($._config.namespace) + + daemonset.mixin.metadata.withLabels(podLabels) + + daemonset.mixin.spec.selector.withMatchLabels(podLabels) + + daemonset.mixin.spec.template.metadata.withLabels(podLabels) + + daemonset.mixin.spec.template.spec.withTolerations([masterToleration]) + + daemonset.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + + daemonset.mixin.spec.template.spec.withContainers(c) + + daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume]) + + daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + + daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + + daemonset.mixin.spec.template.spec.withServiceAccountName('node-exporter'), + + serviceAccount: + local serviceAccount 
= k.core.v1.serviceAccount; + + serviceAccount.new('node-exporter') + + serviceAccount.mixin.metadata.withNamespace($._config.namespace), + + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'node-exporter', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'node-exporter', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'node-exporter', + }, + }, + namespaceSelector: { + matchNames: [ + 'monitoring', + ], + }, + endpoints: [ + { + port: 'https', + scheme: 'https', + interval: '30s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + ], + }, + }, + + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + + local nodeExporterPort = servicePort.newNamed('https', 9100, 'https'); + + service.new('node-exporter', $.nodeExporter.daemonset.spec.selector.matchLabels, nodeExporterPort) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ 'k8s-app': 'node-exporter' }) + + service.mixin.spec.withClusterIp('None'), + }, +} diff --git a/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet new file mode 100644 index 00000000..1970adc7 --- /dev/null +++ b/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet @@ -0,0 +1 @@ +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"alertmanagers.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Alertmanager","plural":"alertmanagers"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Describes an Alertmanager cluster.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of 
an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. 
A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"baseImage":{"description":"Base image that is used to deploy pods, without tag.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to an Alertmanager pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. 
The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. 
Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifer to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. 
For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. 
Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. 
Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is alpha in 1.8 and can be reworked or removed in a future release.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"externalUrl":{"description":"The external URL the Alertmanager instances will be available under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Alertmanager server listen on loopback, so that it does not bind against the Pod IP. 
Note this is only for the Alertmanager UI, not the gossip communication.","type":"boolean"},"logLevel":{"description":"Log level for Alertmanager to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"If set to true all actions on the underlying managed objects are not going to be performed, except for delete actions.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"replicas":{"description":"Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the running cluster equal to the expected size.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"routePrefix":{"description":"The route prefix Alertmanager registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods. The Secrets are mounted into /etc/alertmanager/secrets/\u003csecret-name\u003e.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. 
Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Alertmanager Pods.","type":"string"},"storage":{"description":"StorageSpec defines the configured storage for a group of Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. 
Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. 
Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version the cluster should be on.","type":"string"}}},"status":{"description":"Most recent observed status of the Alertmanager cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. 
Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet new file mode 100644 index 00000000..d6b13ad9 --- /dev/null +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet @@ -0,0 +1 @@ +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheuses.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Prometheus","plural":"prometheuses"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Prometheus defines a Prometheus deployment.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. 
In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"additionalScrapeConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. 
is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"alerting":{"description":"AlertingSpec defines parameters for alerting configuration of Prometheus servers.","properties":{"alertmanagers":{"description":"AlertmanagerEndpoints Prometheus should fire alerts against.","items":{"description":"AlertmanagerEndpoints defines a selection of a single Endpoints object containing alertmanager IPs to fire alerts against.","properties":{"bearerTokenFile":{"description":"BearerTokenFile to read from filesystem to use when authenticating to Alertmanager.","type":"string"},"name":{"description":"Name of Endpoints object in Namespace.","type":"string"},"namespace":{"description":"Namespace of Endpoints object.","type":"string"},"pathPrefix":{"description":"Prefix for the HTTP path alerts are pushed to.","type":"string"},"port":{},"scheme":{"description":"Scheme to use when firing alerts.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the 
targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}},"required":["namespace","name","port"]},"type":"array"}},"required":["alertmanagers"]},"baseImage":{"description":"Base image to use for a Prometheus deployment.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. 
Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a 
key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifer to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). 
Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is alpha in 1.8 and can be reworked or removed in a future release.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"evaluationInterval":{"description":"Interval between consecutive evaluations.","type":"string"},"externalLabels":{"description":"The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager).","type":"object"},"externalUrl":{"description":"The external URL the Prometheus instances will be available under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.","type":"boolean"},"logLevel":{"description":"Log level for Prometheus to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"When a Prometheus deployment is paused, no actions except for deletion will be performed on the underlying objects.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"remoteRead":{"description":"If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteReadSpec defines the remote_read configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote read.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote read.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"readRecent":{"description":"Whether reads should be made for queries for time ranges that the local storage should have complete data for.","type":"boolean"},"remoteTimeout":{"description":"Timeout for requests to the remote read endpoint.","type":"string"},"requiredMatchers":{"description":"An optional list of equality matchers which have to be present in a selector to query the remote read endpoint.","type":"object"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"}},"required":["url"]},"type":"array"},"remoteWrite":{"description":"If specified, the remote_write spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteWriteSpec defines the remote_write configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote write.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote write.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"remoteTimeout":{"description":"Timeout for requests to the remote write endpoint.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"},"writeRelabelConfigs":{"description":"The list of remote write relabel configurations.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. 
Default is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. Default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"}},"required":["url"]},"type":"array"},"replicas":{"description":"Number of instances to deploy for a Prometheus deployment.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"retention":{"description":"Time duration Prometheus shall retain data for.","type":"string"},"routePrefix":{"description":"The route prefix Prometheus registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"ruleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"scrapeInterval":{"description":"Interval between consecutive scrapes.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. 
The Secrets are mounted into /etc/prometheus/secrets/\u003csecret-name\u003e. Secrets changes after initial creation of a Prometheus object are not reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated with the new list of secrets.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"serviceMonitorNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"serviceMonitorSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storage":{"description":"StorageSpec defines the configured storage for a group of Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. 
If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. 
If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version of Prometheus to be deployed.","type":"string"}}},"status":{"description":"Most recent observed status of the Prometheus cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Prometheus deployment.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Prometheus deployment.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet new file mode 100644 index 00000000..c1003ebf --- /dev/null +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -0,0 +1,152 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +{ + 
_config+:: { + namespace: 'default', + + versions+:: { + prometheusOperator: 'v0.19.0', + }, + + imageRepos+:: { + prometheusOperator: 'quay.io/coreos/prometheus-operator', + }, + }, + + prometheusOperator+:: { + // Prefixing with 0 to ensure these manifests are listed and therefore created first. + '0alertmanagerCustomResourceDefinition': import 'alertmanager-crd.libsonnet', + '0prometheusCustomResourceDefinition': import 'prometheus-crd.libsonnet', + '0servicemonitorCustomResourceDefinition': import 'servicemonitor-crd.libsonnet', + + clusterRoleBinding: + local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; + + clusterRoleBinding.new() + + clusterRoleBinding.mixin.metadata.withName('prometheus-operator') + + clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + clusterRoleBinding.mixin.roleRef.withName('prometheus-operator') + + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + + clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-operator', namespace: $._config.namespace }]), + + clusterRole: + local clusterRole = k.rbac.v1.clusterRole; + local policyRule = clusterRole.rulesType; + + local extensionsRule = policyRule.new() + + policyRule.withApiGroups(['extensions']) + + policyRule.withResources([ + 'thirdpartyresources', + ]) + + policyRule.withVerbs(['*']); + + local apiExtensionsRule = policyRule.new() + + policyRule.withApiGroups(['apiextensions.k8s.io']) + + policyRule.withResources([ + 'customresourcedefinitions', + ]) + + policyRule.withVerbs(['*']); + + local monitoringRule = policyRule.new() + + policyRule.withApiGroups(['monitoring.coreos.com']) + + policyRule.withResources([ + 'alertmanagers', + 'prometheuses', + 'prometheuses/finalizers', + 'alertmanagers/finalizers', + 'servicemonitors', + ]) + + policyRule.withVerbs(['*']); + + local appsRule = policyRule.new() + + policyRule.withApiGroups(['apps']) + + policyRule.withResources([ + 'statefulsets', + ]) + + 
policyRule.withVerbs(['*']); + + local coreRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'configmaps', + 'secrets', + ]) + + policyRule.withVerbs(['*']); + + local podRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'pods', + ]) + + policyRule.withVerbs(['list', 'delete']); + + local routingRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'services', + 'endpoints', + ]) + + policyRule.withVerbs(['get', 'create', 'update']); + + local nodeRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'nodes', + ]) + + policyRule.withVerbs(['list', 'watch']); + + local namespaceRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'namespaces', + ]) + + policyRule.withVerbs(['list']); + + local rules = [extensionsRule, apiExtensionsRule, monitoringRule, appsRule, coreRule, podRule, routingRule, nodeRule, namespaceRule]; + + clusterRole.new() + + clusterRole.mixin.metadata.withName('prometheus-operator') + + clusterRole.withRules(rules), + + deployment: + local deployment = k.apps.v1beta2.deployment; + local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; + local containerPort = container.portsType; + + local targetPort = 8080; + local podLabels = { 'k8s-app': 'prometheus-operator' }; + + local operatorContainer = + container.new('prometheus-operator', $._config.imageRepos.prometheusOperator + ':' + $._config.versions.prometheusOperator) + + container.withPorts(containerPort.newNamed('http', targetPort)) + + container.withArgs(['--kubelet-service=kube-system/kubelet', '--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1']) + + container.mixin.resources.withRequests({ cpu: '100m', memory: '50Mi' }) + + container.mixin.resources.withLimits({ cpu: '200m', memory: '100Mi' }); + + deployment.new('prometheus-operator', 1, 
operatorContainer, podLabels) + + deployment.mixin.metadata.withNamespace($._config.namespace) + + deployment.mixin.metadata.withLabels(podLabels) + + deployment.mixin.spec.selector.withMatchLabels(podLabels) + + deployment.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + + deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + + deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + + deployment.mixin.spec.template.spec.withServiceAccountName('prometheus-operator'), + + serviceAccount: + local serviceAccount = k.core.v1.serviceAccount; + + serviceAccount.new('prometheus-operator') + + serviceAccount.mixin.metadata.withNamespace($._config.namespace), + + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + + local poServicePort = servicePort.newNamed('http', 8080, 'http'); + + service.new('prometheus-operator', $.prometheusOperator.deployment.spec.selector.matchLabels, [poServicePort]) + + service.mixin.metadata.withLabels({ 'k8s-app': 'prometheus-operator' }) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.spec.withClusterIp('None'), + }, +} diff --git a/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet new file mode 100644 index 00000000..1df3123f --- /dev/null +++ b/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet @@ -0,0 +1 @@ +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"servicemonitors.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"ServiceMonitor","plural":"servicemonitors"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"ServiceMonitor defines monitoring for a set of services.","properties":{"apiVersion":{"description":"APIVersion defines the 
versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"ServiceMonitorSpec contains specification parameters for a ServiceMonitor.","properties":{"endpoints":{"description":"A list of endpoints allowed as part of this ServiceMonitor.","items":{"description":"Endpoint defines a scrapeable endpoint serving Prometheus metrics.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerTokenFile":{"description":"File to read bearer token for scraping targets.","type":"string"},"honorLabels":{"description":"HonorLabels chooses the metric's labels on collisions with target labels.","type":"boolean"},"interval":{"description":"Interval at which metrics should be scraped","type":"string"},"metricRelabelings":{"description":"MetricRelabelConfigs to apply to samples before ingestion.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. defailt is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. 
Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"},"params":{"description":"Optional HTTP URL parameters","type":"object"},"path":{"description":"HTTP path to scrape for metrics.","type":"string"},"port":{"description":"Name of the service port this endpoint refers to. Mutually exclusive with targetPort.","type":"string"},"scheme":{"description":"HTTP scheme to use for scraping.","type":"string"},"scrapeTimeout":{"description":"Timeout after which the scrape is ended","type":"string"},"targetPort":{},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}}},"type":"array"},"jobLabel":{"description":"The label to use to retrieve the job name from.","type":"string"},"namespaceSelector":{"description":"A selector for selecting namespaces either selecting all namespaces or a list of namespaces.","properties":{"any":{"description":"Boolean describing whether all namespaces are selected in contrast to a list restricting them.","type":"boolean"},"matchNames":{"description":"List of namespace names.","items":{"type":"string"},"type":"array"}}},"selector":{"description":"A label selector is a label query over a set of resources. 
The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"targetLabels":{"description":"TargetLabels transfers labels on the Kubernetes Service onto the target.","items":{"type":"string"},"type":"array"}},"required":["endpoints","selector"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet new file mode 100644 index 00000000..3b2d415c --- /dev/null +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -0,0 +1,454 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +{ + _config+:: { + namespace: 'default', + + versions+:: { + prometheus: 'v2.2.1', + }, + + imageRepos+:: { + prometheus: 'quay.io/prometheus/prometheus', + }, + + prometheus+:: { + replicas: 2, + rules: {}, + }, + }, + + prometheus+:: { + serviceAccount: + local serviceAccount = k.core.v1.serviceAccount; + + serviceAccount.new('prometheus-k8s') + + serviceAccount.mixin.metadata.withNamespace($._config.namespace), + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + + local prometheusPort = servicePort.newNamed('web', 9090, 'web'); + + service.new('prometheus-k8s', { app: 'prometheus', prometheus: 'k8s' }, prometheusPort) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ prometheus: 'k8s' }), + rules: + local configMap = k.core.v1.configMap; + + configMap.new('prometheus-k8s-rules', { 'all.rules.yaml': std.manifestYamlDoc($._config.prometheus.rules) }) + + configMap.mixin.metadata.withLabels({ role: 'alert-rules', prometheus: 'k8s' }) + + configMap.mixin.metadata.withNamespace($._config.namespace), + roleBindingDefault: + local roleBinding = k.rbac.v1.roleBinding; + + roleBinding.new() + + roleBinding.mixin.metadata.withName('prometheus-k8s') + + 
roleBinding.mixin.metadata.withNamespace('default') + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withName('prometheus-k8s') + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + clusterRole: + local clusterRole = k.rbac.v1.clusterRole; + local policyRule = clusterRole.rulesType; + + local nodeMetricsRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources(['nodes/metrics']) + + policyRule.withVerbs(['get']); + + local metricsRule = policyRule.new() + + policyRule.withNonResourceUrls('/metrics') + + policyRule.withVerbs(['get']); + + local rules = [nodeMetricsRule, metricsRule]; + + clusterRole.new() + + clusterRole.mixin.metadata.withName('prometheus-k8s') + + clusterRole.withRules(rules), + roleConfig: + local role = k.rbac.v1.role; + local policyRule = role.rulesType; + + local configmapRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'configmaps', + ]) + + policyRule.withVerbs(['get']); + + role.new() + + role.mixin.metadata.withName('prometheus-k8s-config') + + role.mixin.metadata.withNamespace($._config.namespace) + + role.withRules(configmapRule), + roleBindingConfig: + local roleBinding = k.rbac.v1.roleBinding; + + roleBinding.new() + + roleBinding.mixin.metadata.withName('prometheus-k8s-config') + + roleBinding.mixin.metadata.withNamespace($._config.namespace) + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withName('prometheus-k8s-config') + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + roleBindingNamespace: + local roleBinding = k.rbac.v1.roleBinding; + + roleBinding.new() + + 
roleBinding.mixin.metadata.withName('prometheus-k8s') + + roleBinding.mixin.metadata.withNamespace($._config.namespace) + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withName('prometheus-k8s') + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + clusterRoleBinding: + local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; + + clusterRoleBinding.new() + + clusterRoleBinding.mixin.metadata.withName('prometheus-k8s') + + clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + clusterRoleBinding.mixin.roleRef.withName('prometheus-k8s') + + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + + clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + roleKubeSystem: + local role = k.rbac.v1.role; + local policyRule = role.rulesType; + + local coreRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'nodes', + 'services', + 'endpoints', + 'pods', + ]) + + policyRule.withVerbs(['get', 'list', 'watch']); + + role.new() + + role.mixin.metadata.withName('prometheus-k8s') + + role.mixin.metadata.withNamespace('kube-system') + + role.withRules(coreRule), + roleDefault: + local role = k.rbac.v1.role; + local policyRule = role.rulesType; + + local coreRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'nodes', + 'services', + 'endpoints', + 'pods', + ]) + + policyRule.withVerbs(['get', 'list', 'watch']); + + role.new() + + role.mixin.metadata.withName('prometheus-k8s') + + role.mixin.metadata.withNamespace('default') + + role.withRules(coreRule), + roleBindingKubeSystem: + local roleBinding = k.rbac.v1.roleBinding; + + roleBinding.new() + + roleBinding.mixin.metadata.withName('prometheus-k8s') + + 
roleBinding.mixin.metadata.withNamespace('kube-system') + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withName('prometheus-k8s') + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + roleNamespace: + local role = k.rbac.v1.role; + local policyRule = role.rulesType; + + local coreRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'nodes', + 'services', + 'endpoints', + 'pods', + ]) + + policyRule.withVerbs(['get', 'list', 'watch']); + + role.new() + + role.mixin.metadata.withName('prometheus-k8s') + + role.mixin.metadata.withNamespace($._config.namespace) + + role.withRules(coreRule), + prometheus: + local container = k.core.v1.pod.mixin.spec.containersType; + local resourceRequirements = container.mixin.resourcesType; + local selector = k.apps.v1beta2.deployment.mixin.spec.selectorType; + + local resources = resourceRequirements.new() + + resourceRequirements.withRequests({ memory: '400Mi' }); + + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'Prometheus', + metadata: { + name: 'k8s', + namespace: $._config.namespace, + labels: { + prometheus: 'k8s', + }, + }, + spec: { + replicas: $._config.prometheus.replicas, + version: $._config.versions.prometheus, + baseImage: $._config.imageRepos.prometheus, + serviceAccountName: 'prometheus-k8s', + serviceMonitorSelector: selector.withMatchExpressions({ key: 'k8s-app', operator: 'Exists' }), + nodeSelector: { 'beta.kubernetes.io/os': 'linux' }, + ruleSelector: selector.withMatchLabels({ + role: 'alert-rules', + prometheus: 'k8s', + }), + resources: resources, + alerting: { + alertmanagers: [ + { + namespace: $._config.namespace, + name: 'alertmanager-main', + port: 'web', + }, + ], + }, + }, + }, + serviceMonitorPrometheus: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + 
metadata: { + name: 'prometheus', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'prometheus', + }, + }, + spec: { + selector: { + matchLabels: { + prometheus: 'k8s', + }, + }, + namespaceSelector: { + matchNames: [ + 'monitoring', + ], + }, + endpoints: [ + { + port: 'web', + interval: '30s', + }, + ], + }, + }, + serviceMonitorPrometheusOperator: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'prometheus-operator', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'prometheus-operator', + }, + }, + spec: { + endpoints: [ + { + port: 'http', + }, + ], + selector: { + matchLabels: { + 'k8s-app': 'prometheus-operator', + }, + }, + }, + }, + serviceMonitorKubeScheduler: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-scheduler', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'kube-scheduler', + }, + }, + spec: { + jobLabel: 'k8s-app', + endpoints: [ + { + port: 'http-metrics', + interval: '30s', + }, + ], + selector: { + matchLabels: { + 'k8s-app': 'kube-scheduler', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + }, + }, + serviceMonitorKubelet: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kubelet', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'kubelet', + }, + }, + spec: { + jobLabel: 'k8s-app', + endpoints: [ + { + port: 'https-metrics', + scheme: 'https', + interval: '30s', + tlsConfig: { + insecureSkipVerify: true, + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + { + port: 'https-metrics', + scheme: 'https', + path: '/metrics/cadvisor', + interval: '30s', + honorLabels: true, + tlsConfig: { + insecureSkipVerify: true, + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + ], + selector: { + matchLabels: { + 'k8s-app': 'kubelet', + }, + }, + namespaceSelector: { + matchNames: 
[ + 'kube-system', + ], + }, + }, + }, + serviceMonitorKubeControllerManager: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-controller-manager', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'kube-controller-manager', + }, + }, + spec: { + jobLabel: 'k8s-app', + endpoints: [ + { + port: 'http-metrics', + interval: '30s', + }, + ], + selector: { + matchLabels: { + 'k8s-app': 'kube-controller-manager', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + }, + }, + serviceMonitorApiserver: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-apiserver', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'apiserver', + }, + }, + spec: { + jobLabel: 'component', + selector: { + matchLabels: { + component: 'apiserver', + provider: 'kubernetes', + }, + }, + namespaceSelector: { + matchNames: [ + 'default', + ], + }, + endpoints: [ + { + port: 'https', + interval: '30s', + scheme: 'https', + tlsConfig: { + caFile: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt', + serverName: 'kubernetes', + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + ], + }, + }, + serviceMonitorCoreDNS: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'coredns', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'coredns', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'coredns', + component: 'metrics', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'http-metrics', + interval: '15s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + ], + }, + }, + }, +} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-cluster-role-binding.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-cluster-role-binding.libsonnet deleted file mode 
100644 index ae150c35..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics-cluster-role-binding.libsonnet +++ /dev/null @@ -1,12 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - -{ - new(namespace):: - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName("kube-state-metrics") + - clusterRoleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - clusterRoleBinding.mixin.roleRef.withName("kube-state-metrics") + - clusterRoleBinding.mixin.roleRef.mixinInstance({kind: "ClusterRole"}) + - clusterRoleBinding.withSubjects([{kind: "ServiceAccount", name: "kube-state-metrics", namespace: namespace}]) -} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-cluster-role.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-cluster-role.libsonnet deleted file mode 100644 index 976d850a..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics-cluster-role.libsonnet +++ /dev/null @@ -1,75 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local clusterRole = k.rbac.v1.clusterRole; -local policyRule = clusterRole.rulesType; - -local coreRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources([ - "configmaps", - "secrets", - "nodes", - "pods", - "services", - "resourcequotas", - "replicationcontrollers", - "limitranges", - "persistentvolumeclaims", - "persistentvolumes", - "namespaces", - "endpoints", - ]) + - policyRule.withVerbs(["list", "watch"]); - -local extensionsRule = policyRule.new() + - policyRule.withApiGroups(["extensions"]) + - policyRule.withResources([ - "daemonsets", - "deployments", - "replicasets", - ]) + - policyRule.withVerbs(["list", "watch"]); - -local appsRule = policyRule.new() + - policyRule.withApiGroups(["apps"]) + - policyRule.withResources([ - "statefulsets", - ]) + - policyRule.withVerbs(["list", "watch"]); - -local batchRule = policyRule.new() + - policyRule.withApiGroups(["batch"]) + - 
policyRule.withResources([ - "cronjobs", - "jobs", - ]) + - policyRule.withVerbs(["list", "watch"]); - -local autoscalingRule = policyRule.new() + - policyRule.withApiGroups(["autoscaling"]) + - policyRule.withResources([ - "horizontalpodautoscalers", - ]) + - policyRule.withVerbs(["list", "watch"]); - -local authenticationRole = policyRule.new() + - policyRule.withApiGroups(["authentication.k8s.io"]) + - policyRule.withResources([ - "tokenreviews", - ]) + - policyRule.withVerbs(["create"]); - -local authorizationRole = policyRule.new() + - policyRule.withApiGroups(["authorization.k8s.io"]) + - policyRule.withResources([ - "subjectaccessreviews", - ]) + - policyRule.withVerbs(["create"]); - -local rules = [coreRule, extensionsRule, appsRule, batchRule, autoscalingRule, authenticationRole, authorizationRole]; - -{ - new():: - clusterRole.new() + - clusterRole.mixin.metadata.withName("kube-state-metrics") + - clusterRole.withRules(rules) -} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-deployment.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-deployment.libsonnet deleted file mode 100644 index e873fa30..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics-deployment.libsonnet +++ /dev/null @@ -1,86 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local deployment = k.apps.v1beta2.deployment; - -local deployment = k.apps.v1beta2.deployment; -local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; -local volume = k.apps.v1beta2.deployment.mixin.spec.template.spec.volumesType; -local containerPort = container.portsType; -local containerVolumeMount = container.volumeMountsType; -local podSelector = deployment.mixin.spec.template.spec.selectorType; - -local kubeStateMetricsVersion = "v1.3.0"; -local kubeRbacProxyVersion = "v0.3.0"; -local addonResizerVersion = "1.0"; -local podLabels = {"app": "kube-state-metrics"}; - -local proxyClusterMetrics = - container.new("kube-rbac-proxy-main", 
"quay.io/coreos/kube-rbac-proxy:" + kubeRbacProxyVersion) + - container.withArgs([ - "--secure-listen-address=:8443", - "--upstream=http://127.0.0.1:8081/", - ]) + - container.withPorts(containerPort.newNamed("https-main", 8443)) + - container.mixin.resources.withRequests({cpu: "10m", memory: "20Mi"}) + - container.mixin.resources.withLimits({cpu: "20m", memory: "40Mi"}); - -local proxySelfMetrics = - container.new("kube-rbac-proxy-self", "quay.io/coreos/kube-rbac-proxy:" + kubeRbacProxyVersion) + - container.withArgs([ - "--secure-listen-address=:9443", - "--upstream=http://127.0.0.1:8082/", - ]) + - container.withPorts(containerPort.newNamed("https-self", 9443)) + - container.mixin.resources.withRequests({cpu: "10m", memory: "20Mi"}) + - container.mixin.resources.withLimits({cpu: "20m", memory: "40Mi"}); - -local kubeStateMetrics = - container.new("kube-state-metrics", "quay.io/coreos/kube-state-metrics:" + kubeStateMetricsVersion) + - container.withArgs([ - "--host=127.0.0.1", - "--port=8081", - "--telemetry-host=127.0.0.1", - "--telemetry-port=8082", - ]) + - container.mixin.resources.withRequests({cpu: "102m", memory: "180Mi"}) + - container.mixin.resources.withLimits({cpu: "102m", memory: "180Mi"}); - -local addonResizer = - container.new("addon-resizer", "quay.io/coreos/addon-resizer:" + addonResizerVersion) + - container.withCommand([ - "/pod_nanny", - "--container=kube-state-metrics", - "--cpu=100m", - "--extra-cpu=2m", - "--memory=150Mi", - "--extra-memory=30Mi", - "--threshold=5", - "--deployment=kube-state-metrics", - ]) + - container.withEnv([ - { - name: "MY_POD_NAME", - valueFrom: { - fieldRef: {apiVersion: "v1", fieldPath: "metadata.name"} - } - }, { - name: "MY_POD_NAMESPACE", - valueFrom: { - fieldRef: {apiVersion: "v1", fieldPath: "metadata.namespace"} - } - } - ]) + - container.mixin.resources.withRequests({cpu: "10m", memory: "30Mi"}) + - container.mixin.resources.withLimits({cpu: "10m", memory: "30Mi"}); - -local c = [proxyClusterMetrics, 
proxySelfMetrics, kubeStateMetrics, addonResizer]; - -{ - new(namespace):: - deployment.new("kube-state-metrics", 1, c, podLabels) + - deployment.mixin.metadata.withNamespace(namespace) + - deployment.mixin.metadata.withLabels(podLabels) + - deployment.mixin.spec.selector.withMatchLabels(podLabels) + - deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + - deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - deployment.mixin.spec.template.spec.withServiceAccountName("kube-state-metrics") -} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet deleted file mode 100644 index 02a43b7a..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics-role-binding.libsonnet +++ /dev/null @@ -1,13 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local roleBinding = k.rbac.v1.roleBinding; - -{ - new(namespace):: - roleBinding.new() + - roleBinding.mixin.metadata.withName("kube-state-metrics") + - roleBinding.mixin.metadata.withNamespace(namespace) + - roleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - roleBinding.mixin.roleRef.withName("kube-state-metrics") + - roleBinding.mixin.roleRef.mixinInstance({kind: "Role"}) + - roleBinding.withSubjects([{kind: "ServiceAccount", name: "kube-state-metrics"}]) -} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-role.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-role.libsonnet deleted file mode 100644 index bf80880f..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics-role.libsonnet +++ /dev/null @@ -1,28 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local role = k.rbac.v1.role; -local policyRule = role.rulesType; - -local coreRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources([ - "pods", - ]) + - policyRule.withVerbs(["get"]); - -local extensionsRule = policyRule.new() + - 
policyRule.withApiGroups(["extensions"]) + - policyRule.withResources([ - "deployments", - ]) + - policyRule.withVerbs(["get", "update"]) + - policyRule.withResourceNames(["kube-state-metrics"]); - -local rules = [coreRule, extensionsRule]; - -{ - new(namespace):: - role.new() + - role.mixin.metadata.withName("kube-state-metrics") + - role.mixin.metadata.withNamespace(namespace) + - role.withRules(rules) -} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-service-account.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-service-account.libsonnet deleted file mode 100644 index 6e6904ff..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics-service-account.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local serviceAccount = k.core.v1.serviceAccount; - -{ - new(namespace):: - serviceAccount.new("kube-state-metrics") + - serviceAccount.mixin.metadata.withNamespace(namespace) -} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-service-monitor.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-service-monitor.libsonnet deleted file mode 100644 index 3d24aec3..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics-service-monitor.libsonnet +++ /dev/null @@ -1,48 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "kube-state-metrics", - "namespace": namespace, - "labels": { - "k8s-app": "kube-state-metrics" - } - }, - "spec": { - "jobLabel": "k8s-app", - "selector": { - "matchLabels": { - "k8s-app": "kube-state-metrics" - } - }, - "namespaceSelector": { - "matchNames": [ - "monitoring" - ] - }, - "endpoints": [ - { - "port": "https-main", - "scheme": "https", - "interval": "30s", - "honorLabels": true, - "bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token", - "tlsConfig": { - "insecureSkipVerify": true - } - }, - { - "port": "https-self", - "scheme": "https", - "interval": "30s", - 
"bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token", - "tlsConfig": { - "insecureSkipVerify": true - } - } - ] - } - } -} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics-service.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics-service.libsonnet deleted file mode 100644 index c8eaee18..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics-service.libsonnet +++ /dev/null @@ -1,15 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local service = k.core.v1.service; -local servicePort = k.core.v1.service.mixin.spec.portsType; - -local ksmDeployment = import "kube-state-metrics-deployment.libsonnet"; - -local ksmServicePortMain = servicePort.newNamed("https-main", 8443, "https-main"); -local ksmServicePortSelf = servicePort.newNamed("https-self", 9443, "https-self"); - -{ - new(namespace):: - service.new("kube-state-metrics", ksmDeployment.new(namespace).spec.selector.matchLabels, [ksmServicePortMain, ksmServicePortSelf]) + - service.mixin.metadata.withNamespace(namespace) + - service.mixin.metadata.withLabels({"k8s-app": "kube-state-metrics"}) -} diff --git a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet deleted file mode 100644 index d82765f6..00000000 --- a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet +++ /dev/null @@ -1,10 +0,0 @@ -{ - clusterRoleBinding:: import "kube-state-metrics-cluster-role-binding.libsonnet", - clusterRole:: import "kube-state-metrics-cluster-role.libsonnet", - deployment:: import "kube-state-metrics-deployment.libsonnet", - roleBinding:: import "kube-state-metrics-role-binding.libsonnet", - role:: import "kube-state-metrics-role.libsonnet", - serviceAccount:: import "kube-state-metrics-service-account.libsonnet", - service:: import "kube-state-metrics-service.libsonnet", - serviceMonitor:: import "kube-state-metrics-service-monitor.libsonnet", -} diff --git 
a/jsonnet/node-exporter/node-exporter-cluster-role-binding.libsonnet b/jsonnet/node-exporter/node-exporter-cluster-role-binding.libsonnet deleted file mode 100644 index 39f373b5..00000000 --- a/jsonnet/node-exporter/node-exporter-cluster-role-binding.libsonnet +++ /dev/null @@ -1,12 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - -{ - new(namespace):: - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName("node-exporter") + - clusterRoleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - clusterRoleBinding.mixin.roleRef.withName("node-exporter") + - clusterRoleBinding.mixin.roleRef.mixinInstance({kind: "ClusterRole"}) + - clusterRoleBinding.withSubjects([{kind: "ServiceAccount", name: "node-exporter", namespace: namespace}]) -} diff --git a/jsonnet/node-exporter/node-exporter-cluster-role.libsonnet b/jsonnet/node-exporter/node-exporter-cluster-role.libsonnet deleted file mode 100644 index 426e0a66..00000000 --- a/jsonnet/node-exporter/node-exporter-cluster-role.libsonnet +++ /dev/null @@ -1,26 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local clusterRole = k.rbac.v1.clusterRole; -local policyRule = clusterRole.rulesType; - -local authenticationRole = policyRule.new() + - policyRule.withApiGroups(["authentication.k8s.io"]) + - policyRule.withResources([ - "tokenreviews", - ]) + - policyRule.withVerbs(["create"]); - -local authorizationRole = policyRule.new() + - policyRule.withApiGroups(["authorization.k8s.io"]) + - policyRule.withResources([ - "subjectaccessreviews", - ]) + - policyRule.withVerbs(["create"]); - -local rules = [authenticationRole, authorizationRole]; - -{ - new():: - clusterRole.new() + - clusterRole.mixin.metadata.withName("node-exporter") + - clusterRole.withRules(rules) -} diff --git a/jsonnet/node-exporter/node-exporter-daemonset.libsonnet b/jsonnet/node-exporter/node-exporter-daemonset.libsonnet deleted file mode 100644 index 
ac642891..00000000 --- a/jsonnet/node-exporter/node-exporter-daemonset.libsonnet +++ /dev/null @@ -1,58 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; - -local daemonset = k.apps.v1beta2.daemonSet; -local container = daemonset.mixin.spec.template.spec.containersType; -local volume = daemonset.mixin.spec.template.spec.volumesType; -local containerPort = container.portsType; -local containerVolumeMount = container.volumeMountsType; -local podSelector = daemonset.mixin.spec.template.spec.selectorType; - -local nodeExporterVersion = "v0.15.2"; -local kubeRbacProxyVersion = "v0.3.0"; -local podLabels = {"app": "node-exporter"}; - -local procVolumeName = "proc"; -local procVolume = volume.fromHostPath(procVolumeName, "/proc"); -local procVolumeMount = containerVolumeMount.new(procVolumeName, "/host/proc"); - -local sysVolumeName = "sys"; -local sysVolume = volume.fromHostPath(sysVolumeName, "/sys"); -local sysVolumeMount = containerVolumeMount.new(sysVolumeName, "/host/sys"); - -local nodeExporter = - container.new("node-exporter", "quay.io/prometheus/node-exporter:" + nodeExporterVersion) + - container.withArgs([ - "--web.listen-address=127.0.0.1:9101", - "--path.procfs=/host/proc", - "--path.sysfs=/host/sys", - ]) + - container.withVolumeMounts([procVolumeMount, sysVolumeMount]) + - container.mixin.resources.withRequests({cpu: "102m", memory: "180Mi"}) + - container.mixin.resources.withLimits({cpu: "102m", memory: "180Mi"}); - -local proxy = - container.new("kube-rbac-proxy", "quay.io/coreos/kube-rbac-proxy:" + kubeRbacProxyVersion) + - container.withArgs([ - "--secure-listen-address=:9100", - "--upstream=http://127.0.0.1:9101/", - ]) + - container.withPorts(containerPort.newNamed("https", 9100)) + - container.mixin.resources.withRequests({cpu: "10m", memory: "20Mi"}) + - container.mixin.resources.withLimits({cpu: "20m", memory: "40Mi"}); - -local c = [nodeExporter, proxy]; - -{ - new(namespace):: - daemonset.new() + - 
daemonset.mixin.metadata.withName("node-exporter") + - daemonset.mixin.metadata.withNamespace(namespace) + - daemonset.mixin.metadata.withLabels(podLabels) + - daemonset.mixin.spec.selector.withMatchLabels(podLabels) + - daemonset.mixin.spec.template.metadata.withLabels(podLabels) + - daemonset.mixin.spec.template.spec.withContainers(c) + - daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume]) + - daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + - daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - daemonset.mixin.spec.template.spec.withServiceAccountName("node-exporter") -} diff --git a/jsonnet/node-exporter/node-exporter-service-account.libsonnet b/jsonnet/node-exporter/node-exporter-service-account.libsonnet deleted file mode 100644 index f75a6827..00000000 --- a/jsonnet/node-exporter/node-exporter-service-account.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local serviceAccount = k.core.v1.serviceAccount; - -{ - new(namespace):: - serviceAccount.new("node-exporter") + - serviceAccount.mixin.metadata.withNamespace(namespace) -} diff --git a/jsonnet/node-exporter/node-exporter-service-monitor.libsonnet b/jsonnet/node-exporter/node-exporter-service-monitor.libsonnet deleted file mode 100644 index d1ff25e7..00000000 --- a/jsonnet/node-exporter/node-exporter-service-monitor.libsonnet +++ /dev/null @@ -1,38 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "node-exporter", - "namespace": namespace, - "labels": { - "k8s-app": "node-exporter" - } - }, - "spec": { - "jobLabel": "k8s-app", - "selector": { - "matchLabels": { - "k8s-app": "node-exporter" - } - }, - "namespaceSelector": { - "matchNames": [ - "monitoring" - ] - }, - "endpoints": [ - { - "port": "https", - "scheme": "https", - "interval": "30s", - "bearerTokenFile": 
"/var/run/secrets/kubernetes.io/serviceaccount/token", - "tlsConfig": { - "insecureSkipVerify": true - } - } - ] - } - } -} diff --git a/jsonnet/node-exporter/node-exporter-service.libsonnet b/jsonnet/node-exporter/node-exporter-service.libsonnet deleted file mode 100644 index addbc598..00000000 --- a/jsonnet/node-exporter/node-exporter-service.libsonnet +++ /dev/null @@ -1,14 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local service = k.core.v1.service; -local servicePort = k.core.v1.service.mixin.spec.portsType; - -local nodeExporterDaemonset = import "node-exporter-daemonset.libsonnet"; - -local nodeExporterPort = servicePort.newNamed("https", 9100, "https"); - -{ - new(namespace):: - service.new("node-exporter", nodeExporterDaemonset.new(namespace).spec.selector.matchLabels, nodeExporterPort) + - service.mixin.metadata.withNamespace(namespace) + - service.mixin.metadata.withLabels({"k8s-app": "node-exporter"}) -} diff --git a/jsonnet/node-exporter/node-exporter.libsonnet b/jsonnet/node-exporter/node-exporter.libsonnet deleted file mode 100644 index 5438f001..00000000 --- a/jsonnet/node-exporter/node-exporter.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -{ - clusterRoleBinding:: import "node-exporter-cluster-role-binding.libsonnet", - clusterRole:: import "node-exporter-cluster-role.libsonnet", - daemonset:: import "node-exporter-daemonset.libsonnet", - serviceAccount:: import "node-exporter-service-account.libsonnet", - service:: import "node-exporter-service.libsonnet", - serviceMonitor:: import "node-exporter-service-monitor.libsonnet", -} diff --git a/jsonnet/prometheus-operator/prometheus-operator-cluster-role-binding.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-cluster-role-binding.libsonnet deleted file mode 100644 index 64453c52..00000000 --- a/jsonnet/prometheus-operator/prometheus-operator-cluster-role-binding.libsonnet +++ /dev/null @@ -1,12 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local clusterRoleBinding = 
k.rbac.v1.clusterRoleBinding; - -{ - new(namespace):: - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName("prometheus-operator") + - clusterRoleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - clusterRoleBinding.mixin.roleRef.withName("prometheus-operator") + - clusterRoleBinding.mixin.roleRef.mixinInstance({kind: "ClusterRole"}) + - clusterRoleBinding.withSubjects([{kind: "ServiceAccount", name: "prometheus-operator", namespace: namespace}]) -} diff --git a/jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet deleted file mode 100644 index 858d7542..00000000 --- a/jsonnet/prometheus-operator/prometheus-operator-cluster-role.libsonnet +++ /dev/null @@ -1,81 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local clusterRole = k.rbac.v1.clusterRole; -local policyRule = clusterRole.rulesType; - -local extensionsRule = policyRule.new() + - policyRule.withApiGroups(["extensions"]) + - policyRule.withResources([ - "thirdpartyresources", - ]) + - policyRule.withVerbs(["*"]); - -local apiExtensionsRule = policyRule.new() + - policyRule.withApiGroups(["apiextensions.k8s.io"]) + - policyRule.withResources([ - "customresourcedefinitions", - ]) + - policyRule.withVerbs(["*"]); - -local monitoringRule = policyRule.new() + - policyRule.withApiGroups(["monitoring.coreos.com"]) + - policyRule.withResources([ - "alertmanagers", - "prometheuses", - "prometheuses/finalizers", - "alertmanagers/finalizers", - "servicemonitors", - ]) + - policyRule.withVerbs(["*"]); - -local appsRule = policyRule.new() + - policyRule.withApiGroups(["apps"]) + - policyRule.withResources([ - "statefulsets", - ]) + - policyRule.withVerbs(["*"]); - -local coreRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources([ - "configmaps", - "secrets", - ]) + - policyRule.withVerbs(["*"]); - -local podRule = policyRule.new() + - 
policyRule.withApiGroups([""]) + - policyRule.withResources([ - "pods", - ]) + - policyRule.withVerbs(["list", "delete"]); - -local routingRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources([ - "services", - "endpoints", - ]) + - policyRule.withVerbs(["get", "create", "update"]); - -local nodeRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources([ - "nodes", - ]) + - policyRule.withVerbs(["list", "watch"]); - -local namespaceRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources([ - "namespaces", - ]) + - policyRule.withVerbs(["list"]); - -local rules = [extensionsRule, apiExtensionsRule, monitoringRule, appsRule, coreRule, podRule, routingRule, nodeRule, namespaceRule]; - -{ - new():: - clusterRole.new() + - clusterRole.mixin.metadata.withName("prometheus-operator") + - clusterRole.withRules(rules) -} diff --git a/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet deleted file mode 100644 index f8c9ca2a..00000000 --- a/jsonnet/prometheus-operator/prometheus-operator-deployment.libsonnet +++ /dev/null @@ -1,28 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; - -local version = "v0.19.0"; - -local deployment = k.apps.v1beta2.deployment; -local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; -local containerPort = container.portsType; - -local targetPort = 8080; -local podLabels = {"k8s-app": "prometheus-operator"}; - -local operatorContainer = - container.new("prometheus-operator", "quay.io/coreos/prometheus-operator:" + version) + - container.withPorts(containerPort.newNamed("http", targetPort)) + - container.withArgs(["--kubelet-service=kube-system/kubelet", "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"]) + - container.mixin.resources.withRequests({cpu: "100m", memory: "50Mi"}) + - 
container.mixin.resources.withLimits({cpu: "200m", memory: "100Mi"}); - -{ - new(namespace):: - deployment.new("prometheus-operator", 1, operatorContainer, podLabels) + - deployment.mixin.metadata.withNamespace(namespace) + - deployment.mixin.metadata.withLabels(podLabels) + - deployment.mixin.spec.selector.withMatchLabels(podLabels) + - deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + - deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - deployment.mixin.spec.template.spec.withServiceAccountName("prometheus-operator") -} diff --git a/jsonnet/prometheus-operator/prometheus-operator-service-account.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-service-account.libsonnet deleted file mode 100644 index 791ce93c..00000000 --- a/jsonnet/prometheus-operator/prometheus-operator-service-account.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local serviceAccount = k.core.v1.serviceAccount; - -{ - new(namespace):: - serviceAccount.new("prometheus-operator") + - serviceAccount.mixin.metadata.withNamespace(namespace) -} diff --git a/jsonnet/prometheus-operator/prometheus-operator-service-monitor.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-service-monitor.libsonnet deleted file mode 100644 index 07613f8c..00000000 --- a/jsonnet/prometheus-operator/prometheus-operator-service-monitor.libsonnet +++ /dev/null @@ -1,26 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "prometheus-operator", - "namespace": namespace, - "labels": { - "k8s-app": "prometheus-operator" - } - }, - "spec": { - "endpoints": [ - { - "port": "http" - } - ], - "selector": { - "matchLabels": { - "k8s-app": "prometheus-operator" - } - } - } - } -} diff --git a/jsonnet/prometheus-operator/prometheus-operator-service.libsonnet b/jsonnet/prometheus-operator/prometheus-operator-service.libsonnet deleted file mode 
100644 index 8bbd1477..00000000 --- a/jsonnet/prometheus-operator/prometheus-operator-service.libsonnet +++ /dev/null @@ -1,14 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local service = k.core.v1.service; -local servicePort = k.core.v1.service.mixin.spec.portsType; - -local poDeployment = import "prometheus-operator-deployment.libsonnet"; - -local poServicePort = servicePort.newNamed("http", 8080, "http"); - - -{ - new(namespace):: - service.new("prometheus-operator", poDeployment.new(namespace).spec.selector.matchLabels, [poServicePort]) + - service.mixin.metadata.withNamespace(namespace) -} diff --git a/jsonnet/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/prometheus-operator/prometheus-operator.libsonnet deleted file mode 100644 index 3659250d..00000000 --- a/jsonnet/prometheus-operator/prometheus-operator.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -{ - clusterRoleBinding:: import "prometheus-operator-cluster-role-binding.libsonnet", - clusterRole:: import "prometheus-operator-cluster-role.libsonnet", - deployment:: import "prometheus-operator-deployment.libsonnet", - serviceAccount:: import "prometheus-operator-service-account.libsonnet", - service:: import "prometheus-operator-service.libsonnet", - serviceMonitor:: import "prometheus-operator-service-monitor.libsonnet", -} diff --git a/jsonnet/prometheus/prometheus-k8s-cluster-role-binding.libsonnet b/jsonnet/prometheus/prometheus-k8s-cluster-role-binding.libsonnet deleted file mode 100644 index d577bee5..00000000 --- a/jsonnet/prometheus/prometheus-k8s-cluster-role-binding.libsonnet +++ /dev/null @@ -1,12 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - -{ - new(namespace):: - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName("prometheus-k8s") + - clusterRoleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - clusterRoleBinding.mixin.roleRef.withName("prometheus-k8s") + - 
clusterRoleBinding.mixin.roleRef.mixinInstance({kind: "ClusterRole"}) + - clusterRoleBinding.withSubjects([{kind: "ServiceAccount", name: "prometheus-k8s", namespace: namespace}]) -} diff --git a/jsonnet/prometheus/prometheus-k8s-cluster-role.libsonnet b/jsonnet/prometheus/prometheus-k8s-cluster-role.libsonnet deleted file mode 100644 index c514624c..00000000 --- a/jsonnet/prometheus/prometheus-k8s-cluster-role.libsonnet +++ /dev/null @@ -1,21 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local clusterRole = k.rbac.v1.clusterRole; -local policyRule = clusterRole.rulesType; - -local nodeMetricsRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources(["nodes/metrics"]) + - policyRule.withVerbs(["get"]); - -local metricsRule = policyRule.new() + - policyRule.withNonResourceUrls("/metrics") + - policyRule.withVerbs(["get"]); - -local rules = [nodeMetricsRule, metricsRule]; - -{ - new():: - clusterRole.new() + - clusterRole.mixin.metadata.withName("prometheus-k8s") + - clusterRole.withRules(rules) -} diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet deleted file mode 100644 index 631e5fa5..00000000 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; - -{ - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s-config", "prometheus-k8s") -} diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet deleted file mode 100644 index e88ece99..00000000 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; - -{ - new(namespace):: 
prometheusNamespaceRoleBinding.new(namespace, "default", "prometheus-k8s", "prometheus-k8s") -} diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet deleted file mode 100644 index 33967e0a..00000000 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; - -{ - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "kube-system", "prometheus-k8s", "prometheus-k8s") -} diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet deleted file mode 100644 index d70ed6ac..00000000 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; - -{ - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s", "prometheus-k8s") -} diff --git a/jsonnet/prometheus/prometheus-k8s-role-config.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-config.libsonnet deleted file mode 100644 index abd43433..00000000 --- a/jsonnet/prometheus/prometheus-k8s-role-config.libsonnet +++ /dev/null @@ -1,18 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local role = k.rbac.v1.role; -local policyRule = role.rulesType; - -local configmapRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources([ - "configmaps", - ]) + - policyRule.withVerbs(["get"]); - -{ - new(namespace):: - role.new() + - role.mixin.metadata.withName("prometheus-k8s-config") + - role.mixin.metadata.withNamespace(namespace) + - role.withRules(configmapRule), -} diff --git a/jsonnet/prometheus/prometheus-k8s-role-default.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-default.libsonnet 
deleted file mode 100644 index a9abbb1a..00000000 --- a/jsonnet/prometheus/prometheus-k8s-role-default.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -local prometheusNamespaceRole = import "prometheus-namespace-role.libsonnet"; - -{ - new():: prometheusNamespaceRole.new("default") -} diff --git a/jsonnet/prometheus/prometheus-k8s-role-kube-system.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-kube-system.libsonnet deleted file mode 100644 index f1ee9860..00000000 --- a/jsonnet/prometheus/prometheus-k8s-role-kube-system.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -local prometheusNamespaceRole = import "prometheus-namespace-role.libsonnet"; - -{ - new():: prometheusNamespaceRole.new("kube-system") -} diff --git a/jsonnet/prometheus/prometheus-k8s-role-namespace.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-namespace.libsonnet deleted file mode 100644 index 3149cbf0..00000000 --- a/jsonnet/prometheus/prometheus-k8s-role-namespace.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -local prometheusNamespaceRole = import "prometheus-namespace-role.libsonnet"; - -{ - new(namespace):: prometheusNamespaceRole.new(namespace) -} diff --git a/jsonnet/prometheus/prometheus-k8s-rules.libsonnet b/jsonnet/prometheus/prometheus-k8s-rules.libsonnet deleted file mode 100644 index d2014569..00000000 --- a/jsonnet/prometheus/prometheus-k8s-rules.libsonnet +++ /dev/null @@ -1,9 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local configMap = k.core.v1.configMap; - -{ - new(namespace, ruleFiles):: - configMap.new("prometheus-k8s-rules", ruleFiles) + - configMap.mixin.metadata.withLabels({role: "alert-rules", prometheus: "k8s"}) + - configMap.mixin.metadata.withNamespace(namespace) -} diff --git a/jsonnet/prometheus/prometheus-k8s-service-account.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-account.libsonnet deleted file mode 100644 index e8164556..00000000 --- a/jsonnet/prometheus/prometheus-k8s-service-account.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -local k = import 
"ksonnet.beta.3/k.libsonnet"; -local serviceAccount = k.core.v1.serviceAccount; - -{ - new(namespace):: - serviceAccount.new("prometheus-k8s") + - serviceAccount.mixin.metadata.withNamespace(namespace) -} diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-apiserver.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-monitor-apiserver.libsonnet deleted file mode 100644 index e53ed231..00000000 --- a/jsonnet/prometheus/prometheus-k8s-service-monitor-apiserver.libsonnet +++ /dev/null @@ -1,40 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "kube-apiserver", - "namespace": namespace, - "labels": { - "k8s-app": "apiserver" - } - }, - "spec": { - "jobLabel": "component", - "selector": { - "matchLabels": { - "component": "apiserver", - "provider": "kubernetes" - } - }, - "namespaceSelector": { - "matchNames": [ - "default" - ] - }, - "endpoints": [ - { - "port": "https", - "interval": "30s", - "scheme": "https", - "tlsConfig": { - "caFile": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", - "serverName": "kubernetes" - }, - "bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token" - } - ] - } - } -} diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-coredns.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-monitor-coredns.libsonnet deleted file mode 100644 index 89afb452..00000000 --- a/jsonnet/prometheus/prometheus-k8s-service-monitor-coredns.libsonnet +++ /dev/null @@ -1,35 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "coredns", - "namespace": namespace, - "labels": { - "k8s-app": "coredns" - }, - }, - "spec": { - "jobLabel": "k8s-app", - "selector": { - "matchLabels": { - "k8s-app": "coredns", - "component": "metrics" - } - }, - "namespaceSelector": { - "matchNames": [ - "kube-system" - ] - }, - "endpoints": [ - { - "port": "http-metrics", - 
"interval": "15s", - "bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token" - } - ] - } - } -} diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.libsonnet deleted file mode 100644 index 447e8a4b..00000000 --- a/jsonnet/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.libsonnet +++ /dev/null @@ -1,33 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "kube-controller-manager", - "namespace": namespace, - "labels": { - "k8s-app": "kube-controller-manager" - } - }, - "spec": { - "jobLabel": "k8s-app", - "endpoints": [ - { - "port": "http-metrics", - "interval": "30s" - } - ], - "selector": { - "matchLabels": { - "k8s-app": "kube-controller-manager" - } - }, - "namespaceSelector": { - "matchNames": [ - "kube-system" - ] - } - } - } -} diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-kube-scheduler.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-monitor-kube-scheduler.libsonnet deleted file mode 100644 index eaae0c39..00000000 --- a/jsonnet/prometheus/prometheus-k8s-service-monitor-kube-scheduler.libsonnet +++ /dev/null @@ -1,33 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "kube-scheduler", - "namespace": namespace, - "labels": { - "k8s-app": "kube-scheduler" - } - }, - "spec": { - "jobLabel": "k8s-app", - "endpoints": [ - { - "port": "http-metrics", - "interval": "30s" - } - ], - "selector": { - "matchLabels": { - "k8s-app": "kube-scheduler" - } - }, - "namespaceSelector": { - "matchNames": [ - "kube-system" - ] - } - } - } -} diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-kubelet.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-monitor-kubelet.libsonnet deleted file mode 100644 index 6b7dd28e..00000000 --- 
a/jsonnet/prometheus/prometheus-k8s-service-monitor-kubelet.libsonnet +++ /dev/null @@ -1,49 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "kubelet", - "namespace": namespace, - "labels": { - "k8s-app": "kubelet" - } - }, - "spec": { - "jobLabel": "k8s-app", - "endpoints": [ - { - "port": "https-metrics", - "scheme": "https", - "interval": "30s", - "tlsConfig": { - "insecureSkipVerify": true - }, - "bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token" - }, - { - "port": "https-metrics", - "scheme": "https", - "path": "/metrics/cadvisor", - "interval": "30s", - "honorLabels": true, - "tlsConfig": { - "insecureSkipVerify": true - }, - "bearerTokenFile": "/var/run/secrets/kubernetes.io/serviceaccount/token" - } - ], - "selector": { - "matchLabels": { - "k8s-app": "kubelet" - } - }, - "namespaceSelector": { - "matchNames": [ - "kube-system" - ] - } - } - } -} diff --git a/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus.libsonnet b/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus.libsonnet deleted file mode 100644 index 0f4ef084..00000000 --- a/jsonnet/prometheus/prometheus-k8s-service-monitor-prometheus.libsonnet +++ /dev/null @@ -1,32 +0,0 @@ -{ - new(namespace):: - { - "apiVersion": "monitoring.coreos.com/v1", - "kind": "ServiceMonitor", - "metadata": { - "name": "prometheus", - "namespace": namespace, - "labels": { - "k8s-app": "prometheus" - } - }, - "spec": { - "selector": { - "matchLabels": { - "prometheus": "k8s" - } - }, - "namespaceSelector": { - "matchNames": [ - "monitoring" - ] - }, - "endpoints": [ - { - "port": "web", - "interval": "30s" - } - ] - } - } -} diff --git a/jsonnet/prometheus/prometheus-k8s-service.libsonnet b/jsonnet/prometheus/prometheus-k8s-service.libsonnet deleted file mode 100644 index add240dd..00000000 --- a/jsonnet/prometheus/prometheus-k8s-service.libsonnet +++ /dev/null @@ -1,12 +0,0 @@ -local k = import 
"ksonnet.beta.3/k.libsonnet"; -local service = k.core.v1.service; -local servicePort = k.core.v1.service.mixin.spec.portsType; - -local prometheusPort = servicePort.newNamed("web", 9090, "web"); - -{ - new(namespace):: - service.new("prometheus-k8s", {app: "prometheus", prometheus: "k8s"}, prometheusPort) + - service.mixin.metadata.withNamespace(namespace) + - service.mixin.metadata.withLabels({prometheus: "k8s"}) -} diff --git a/jsonnet/prometheus/prometheus-k8s.libsonnet b/jsonnet/prometheus/prometheus-k8s.libsonnet deleted file mode 100644 index 853f62b1..00000000 --- a/jsonnet/prometheus/prometheus-k8s.libsonnet +++ /dev/null @@ -1,43 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; - -local container = k.core.v1.pod.mixin.spec.containersType; -local resourceRequirements = container.mixin.resourcesType; -local selector = k.apps.v1beta2.deployment.mixin.spec.selectorType; - -local resources = resourceRequirements.new() + - resourceRequirements.withRequests({memory: "400Mi"}); - -{ - new(namespace):: - { - apiVersion: "monitoring.coreos.com/v1", - kind: "Prometheus", - metadata: { - name: "k8s", - namespace: namespace, - labels: { - prometheus: "k8s", - }, - }, - spec: { - replicas: 2, - version: "v2.2.1", - serviceAccountName: "prometheus-k8s", - serviceMonitorSelector: selector.withMatchExpressions({key: "k8s-app", operator: "Exists"}), - ruleSelector: selector.withMatchLabels({ - role: "alert-rules", - prometheus: "k8s", - }), - resources: resources, - alerting: { - alertmanagers: [ - { - namespace: "monitoring", - name: "alertmanager-main", - port: "web", - }, - ], - }, - }, - } -} diff --git a/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet b/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet deleted file mode 100644 index a63bcc9c..00000000 --- a/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet +++ /dev/null @@ -1,13 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local roleBinding = 
k.rbac.v1.roleBinding; - -{ - new(serviceAccountNamespace, namespace, roleName, serviceAccountName):: - roleBinding.new() + - roleBinding.mixin.metadata.withName(roleName) + - roleBinding.mixin.metadata.withNamespace(namespace) + - roleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - roleBinding.mixin.roleRef.withName(roleName) + - roleBinding.mixin.roleRef.mixinInstance({kind: "Role"}) + - roleBinding.withSubjects([{kind: "ServiceAccount", name: serviceAccountName, namespace: serviceAccountNamespace}]) -} diff --git a/jsonnet/prometheus/prometheus-namespace-role.libsonnet b/jsonnet/prometheus/prometheus-namespace-role.libsonnet deleted file mode 100644 index 5afdcff4..00000000 --- a/jsonnet/prometheus/prometheus-namespace-role.libsonnet +++ /dev/null @@ -1,21 +0,0 @@ -local k = import "ksonnet.beta.3/k.libsonnet"; -local role = k.rbac.v1.role; -local policyRule = role.rulesType; - -{ - new(namespace):: - local coreRule = policyRule.new() + - policyRule.withApiGroups([""]) + - policyRule.withResources([ - "nodes", - "services", - "endpoints", - "pods", - ]) + - policyRule.withVerbs(["get", "list", "watch"]); - - role.new() + - role.mixin.metadata.withName("prometheus-k8s") + - role.mixin.metadata.withNamespace(namespace) + - role.withRules(coreRule) -} diff --git a/jsonnet/prometheus/prometheus.libsonnet b/jsonnet/prometheus/prometheus.libsonnet deleted file mode 100644 index 1ba4f55d..00000000 --- a/jsonnet/prometheus/prometheus.libsonnet +++ /dev/null @@ -1,22 +0,0 @@ -{ - clusterRoleBinding:: import "prometheus-k8s-cluster-role-binding.libsonnet", - clusterRole:: import "prometheus-k8s-cluster-role.libsonnet", - roleBindingConfig:: import "prometheus-k8s-role-binding-config.libsonnet", - roleBindingNamespace:: import "prometheus-k8s-role-binding-namespace.libsonnet", - roleBindingKubeSystem:: import "prometheus-k8s-role-binding-kube-system.libsonnet", - roleBindingDefault:: import "prometheus-k8s-role-binding-default.libsonnet", - roleConfig:: 
import "prometheus-k8s-role-config.libsonnet", - roleNamespace:: import "prometheus-k8s-role-namespace.libsonnet", - roleKubeSystem:: import "prometheus-k8s-role-kube-system.libsonnet", - roleDefault:: import "prometheus-k8s-role-default.libsonnet", - rules:: import "prometheus-k8s-rules.libsonnet", - serviceAccount:: import "prometheus-k8s-service-account.libsonnet", - serviceMonitorApiserver:: import "prometheus-k8s-service-monitor-apiserver.libsonnet", - serviceMonitorCoreDNS:: import "prometheus-k8s-service-monitor-coredns.libsonnet", - serviceMonitorControllerManager:: import "prometheus-k8s-service-monitor-kube-controller-manager.libsonnet", - serviceMonitorScheduler:: import "prometheus-k8s-service-monitor-kube-scheduler.libsonnet", - serviceMonitorKubelet:: import "prometheus-k8s-service-monitor-kubelet.libsonnet", - serviceMonitorPrometheus:: import "prometheus-k8s-service-monitor-prometheus.libsonnet", - service:: import "prometheus-k8s-service.libsonnet", - prometheus:: import "prometheus-k8s.libsonnet", -} diff --git a/manifests/0prometheus-operator-0alertmanager-custom-resource-definition.yaml b/manifests/0prometheus-operator-0alertmanager-custom-resource-definition.yaml new file mode 100644 index 00000000..b0976073 --- /dev/null +++ b/manifests/0prometheus-operator-0alertmanager-custom-resource-definition.yaml @@ -0,0 +1,2270 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: alertmanagers.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: Alertmanager + plural: alertmanagers + scope: Namespaced + validation: + openAPIV3Schema: + description: Describes an Alertmanager cluster. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + spec: + description: 'Specification of the desired behavior of the Alertmanager + cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + properties: + affinity: + description: Affinity is a group of affinity scheduling rules. + properties: + nodeAffinity: + description: Node affinity is a group of node affinity scheduling + rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the affinity expressions specified by this field, + but it may choose a node that violates one or more of the + expressions. The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node that meets + all of the scheduling requirements (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by iterating through + the elements of this field and adding "weight" to the sum + if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. + items: + description: An empty preferred scheduling term matches all + objects with implicit weight 0 (i.e. it's a no-op). A null + preferred scheduling term matches no objects (i.e. is also + a no-op). + properties: + preference: + description: A null or empty node selector term matches + no objects. + properties: + matchExpressions: + description: Required. A list of node selector requirements. + The requirements are ANDed. 
+ items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + required: + - matchExpressions + weight: + description: Weight associated with matching the corresponding + nodeSelectorTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - preference + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: A node selector represents the union of the results + of one or more label queries over a set of nodes; that is, + it represents the OR of the selectors represented by the node + selector terms. + properties: + nodeSelectorTerms: + description: Required. A list of node selector terms. The + terms are ORed. + items: + description: A null or empty node selector term matches + no objects. + properties: + matchExpressions: + description: Required. A list of node selector requirements. + The requirements are ANDed. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. 
+ type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + required: + - matchExpressions + type: array + required: + - nodeSelectorTerms + podAffinity: + description: Pod affinity is a group of inter pod affinity scheduling + rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the affinity expressions specified by this field, + but it may choose a node that violates one or more of the + expressions. The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node that meets + all of the scheduling requirements (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by iterating through + the elements of this field and adding "weight" to the sum + if the node has pods which matches the corresponding podAffinityTerm; + the node(s) with the highest sum are the most preferred. 
+ items: + description: The weights of all of the matched WeightedPodAffinityTerm + fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) + that this pod should be co-located (affinity) or not + co-located (anti-affinity) with, where co-located is + defined as running on a node whose value of the label + with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector + matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement is + a selector that contains values, a key, and + an operator that relates the key and values. + properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If + the operator is Exists or DoesNotExist, + the values array must be empty. This array + is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". 
+ The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces + the labelSelector applies to (matches against); + null or empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods + matching the labelSelector in the specified namespaces, + where co-located is defined as running on a node + whose value of the label with key topologyKey matches + that of any node on which any of the selected pods + is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + weight: + description: weight associated with matching the corresponding + podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - podAffinityTerm + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements specified by this + field are not met at scheduling time, the pod will not be + scheduled onto the node. If the affinity requirements specified + by this field cease to be met at some point during pod execution + (e.g. due to a pod label update), the system may or may not + try to eventually evict the pod from its node. When there + are multiple elements, the lists of nodes corresponding to + each podAffinityTerm are intersected, i.e. all terms must + be satisfied. + items: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) that + this pod should be co-located (affinity) or not co-located + (anti-affinity) with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on which a pod of the set of pods + is running + properties: + labelSelector: + description: A label selector is a label query over a + set of resources. The result of matchLabels and matchExpressions + are ANDed. 
An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces the + labelSelector applies to (matches against); null or + empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where + co-located is defined as running on a node whose value + of the label with key topologyKey matches that of any + node on which any of the selected pods is running. Empty + topologyKey is not allowed. 
+ type: string + required: + - topologyKey + type: array + podAntiAffinity: + description: Pod anti affinity is a group of inter pod anti affinity + scheduling rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the anti-affinity expressions specified by this + field, but it may choose a node that violates one or more + of the expressions. The node that is most preferred is the + one with the greatest sum of weights, i.e. for each node that + meets all of the scheduling requirements (resource request, + requiredDuringScheduling anti-affinity expressions, etc.), + compute a sum by iterating through the elements of this field + and adding "weight" to the sum if the node has pods which + matches the corresponding podAffinityTerm; the node(s) with + the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm + fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) + that this pod should be co-located (affinity) or not + co-located (anti-affinity) with, where co-located is + defined as running on a node whose value of the label + with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector + matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement is + a selector that contains values, a key, and + an operator that relates the key and values. 
+ properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If + the operator is Exists or DoesNotExist, + the values array must be empty. This array + is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". + The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces + the labelSelector applies to (matches against); + null or empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods + matching the labelSelector in the specified namespaces, + where co-located is defined as running on a node + whose value of the label with key topologyKey matches + that of any node on which any of the selected pods + is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + weight: + description: weight associated with matching the corresponding + podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - podAffinityTerm + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the anti-affinity requirements specified by + this field are not met at scheduling time, the pod will not + be scheduled onto the node. 
If the anti-affinity requirements + specified by this field cease to be met at some point during + pod execution (e.g. due to a pod label update), the system + may or may not try to eventually evict the pod from its node. + When there are multiple elements, the lists of nodes corresponding + to each podAffinityTerm are intersected, i.e. all terms must + be satisfied. + items: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) that + this pod should be co-located (affinity) or not co-located + (anti-affinity) with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on which a pod of the set of pods + is running + properties: + labelSelector: + description: A label selector is a label query over a + set of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. 
+ A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces the + labelSelector applies to (matches against); null or + empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where + co-located is defined as running on a node whose value + of the label with key topologyKey matches that of any + node on which any of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: array + baseImage: + description: Base image that is used to deploy pods, without tag. + type: string + containers: + description: Containers allows injecting additional containers. This + is meant to allow adding an authentication proxy to an Alertmanager + pod. + items: + description: A single application container that you want to run within + a pod. + properties: + args: + description: 'Arguments to the entrypoint. The docker image''s + CMD is used if this is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. If a variable + cannot be resolved, the reference in the input string will be + unchanged. The $(VAR_NAME) syntax can be escaped with a double + $$, ie: $$(VAR_NAME). Escaped references will never be expanded, + regardless of whether the variable exists or not. Cannot be + updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within a shell. 
The + docker image''s ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container''s + environment. If a variable cannot be resolved, the reference + in the input string will be unchanged. The $(VAR_NAME) syntax + can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references + will never be expanded, regardless of whether the variable exists + or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. Must be a + C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are expanded + using the previous defined environment variables in the + container and any service environment variables. If a + variable cannot be resolved, the reference in the input + string will be unchanged. The $(VAR_NAME) syntax can be + escaped with a double $$, ie: $$(VAR_NAME). Escaped references + will never be expanded, regardless of whether the variable + exists or not. Defaults to "".' + type: string + valueFrom: + description: EnvVarSource represents a source for the value + of an EnvVar. + properties: + configMapKeyRef: + description: Selects a key from a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the ConfigMap or it's + key must be defined + type: boolean + required: + - key + fieldRef: + description: ObjectFieldSelector selects an APIVersioned + field of an object. 
+ properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the + specified API version. + type: string + required: + - fieldPath + resourceFieldRef: + description: ResourceFieldSelector represents container + resources (cpu, memory) and their output format + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: {} + resource: + description: 'Required: resource to select' + type: string + required: + - resource + secretKeyRef: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's + key must be defined + type: boolean + required: + - key + required: + - name + type: array + envFrom: + description: List of sources to populate environment variables + in the container. The keys defined within a source must be a + C_IDENTIFIER. All invalid keys will be reported as an event + when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take + precedence. Values defined by an Env with a duplicate key will + take precedence. Cannot be updated. + items: + description: EnvFromSource represents the source of a set of + ConfigMaps + properties: + configMapRef: + description: |- + ConfigMapEnvSource selects a ConfigMap to populate the environment variables with. + + The contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables. + properties: + name: + description: 'Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the ConfigMap must be defined + type: boolean + prefix: + description: An optional identifer to prepend to each key + in the ConfigMap. Must be a C_IDENTIFIER. + type: string + secretRef: + description: |- + SecretEnvSource selects a Secret to populate the environment variables with. + + The contents of the target Secret's Data field will represent the key-value pairs as environment variables. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret must be defined + type: boolean + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management + to default or override container images in workload controllers + like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent + otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Lifecycle describes actions that the management system + should take in response to container lifecycle events. For the + PostStart and PreStop lifecycle handlers, management of the + container blocks until the action is complete, unless the container + process fails, in which case the handler is aborted. + properties: + postStart: + description: Handler defines a specific action that should + be taken + properties: + exec: + description: ExecAction describes a "run in container" + action. 
+ properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', + etc) won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + httpGet: + description: HTTPGetAction describes an action based on + HTTP Get requests. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: {} + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + tcpSocket: + description: TCPSocketAction describes an action based + on opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: {} + required: + - port + preStop: + description: Handler defines a specific action that should + be taken + properties: + exec: + description: ExecAction describes a "run in container" + action. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's filesystem. 
+ The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', + etc) won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + httpGet: + description: HTTPGetAction describes an action based on + HTTP Get requests. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: {} + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + tcpSocket: + description: TCPSocketAction describes an action based + on opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: {} + required: + - port + livenessProbe: + description: Probe describes a health check to be performed against + a container to determine whether it is alive or ready to receive + traffic. + properties: + exec: + description: ExecAction describes a "run in container" action. + properties: + command: + description: Command is the command line to execute inside + the container, the working directory for the command is + root ('/') in the container's filesystem. The command + is simply exec'd, it is not run inside a shell, so traditional + shell instructions ('|', etc) won't work. To use a shell, + you need to explicitly call out to that shell. 
Exit + status of 0 is treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + failureThreshold: + description: Minimum consecutive failures for the probe to + be considered failed after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGetAction describes an action based on HTTP + Get requests. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header to + be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: {} + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + initialDelaySeconds: + description: 'Number of seconds after the container has started + before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to + be considered successful after having failed. Defaults to + 1. Must be 1 for liveness. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocketAction describes an action based on + opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' 
+ type: string + port: {} + required: + - port + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + name: + description: Name of the container specified as a DNS_LABEL. Each + container in a pod must have a unique name (DNS_LABEL). Cannot + be updated. + type: string + ports: + description: List of ports to expose from the container. Exposing + a port here gives the system additional information about the + network connections a container uses, but is primarily informational. + Not specifying a port here DOES NOT prevent that port from being + exposed. Any port which is listening on the default "0.0.0.0" + address inside a container will be accessible from the network. + Cannot be updated. + items: + description: ContainerPort represents a network port in a single + container. + properties: + containerPort: + description: Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external port to. + type: string + hostPort: + description: Number of port to expose on the host. If specified, + this must be a valid port number, 0 < x < 65536. If HostNetwork + is specified, this must match ContainerPort. Most containers + do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an IANA_SVC_NAME + and unique within the pod. Each named port in a pod must + have a unique name. Name for the port that can be referred + to by services. + type: string + protocol: + description: Protocol for port. Must be UDP or TCP. Defaults + to "TCP". 
+ type: string + required: + - containerPort + type: array + readinessProbe: + description: Probe describes a health check to be performed against + a container to determine whether it is alive or ready to receive + traffic. + properties: + exec: + description: ExecAction describes a "run in container" action. + properties: + command: + description: Command is the command line to execute inside + the container, the working directory for the command is + root ('/') in the container's filesystem. The command + is simply exec'd, it is not run inside a shell, so traditional + shell instructions ('|', etc) won't work. To use a shell, + you need to explicitly call out to that shell. Exit + status of 0 is treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + failureThreshold: + description: Minimum consecutive failures for the probe to + be considered failed after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGetAction describes an action based on HTTP + Get requests. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header to + be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: {} + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + initialDelaySeconds: + description: 'Number of seconds after the container has started + before liveness probes are initiated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to + be considered successful after having failed. Defaults to + 1. Must be 1 for liveness. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocketAction describes an action based on + opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: {} + required: + - port + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + securityContext: + description: SecurityContext holds security configuration that + will be applied to a container. Some fields are present in both + SecurityContext and PodSecurityContext. When both are set, + the values in SecurityContext take precedence. 
+ properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls whether a + process can gain more privileges than its parent process. + This bool directly controls if the no_new_privs flag will + be set on the container process. AllowPrivilegeEscalation + is true always when the container is: 1) run as Privileged + 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: Adds and removes POSIX capabilities from running + containers. + properties: + add: + description: Added capabilities + items: + type: string + type: array + drop: + description: Removed capabilities + items: + type: string + type: array + privileged: + description: Run container in privileged mode. Processes in + privileged containers are essentially equivalent to root + on the host. Defaults to false. + type: boolean + readOnlyRootFilesystem: + description: Whether this container has a read-only root filesystem. + Default is false. + type: boolean + runAsNonRoot: + description: Indicates that the container must run as a non-root + user. If true, the Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 (root) and fail + to start the container if it does. If unset or false, no + such validation will be performed. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container + process. Defaults to user specified in image metadata if + unspecified. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: SELinuxOptions are the labels to be applied to + the container + properties: + level: + description: Level is SELinux level label that applies + to the container. 
+ type: string + role: + description: Role is a SELinux role label that applies + to the container. + type: string + type: + description: Type is a SELinux type label that applies + to the container. + type: string + user: + description: User is a SELinux user label that applies + to the container. + type: string + stdin: + description: Whether this container should allocate a buffer for + stdin in the container runtime. If this is not set, reads from + stdin in the container will always result in EOF. Default is + false. + type: boolean + stdinOnce: + description: Whether the container runtime should close the stdin + channel after it has been opened by a single attach. When stdin + is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container + start, is empty until the first client attaches to stdin, and + then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container + is restarted. If this flag is false, a container processes that + reads from stdin will never receive an EOF. Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the file to which the container''s + termination message will be written is mounted into the container''s + filesystem. Message written is intended to be brief final status, + such as an assertion failure message. Will be truncated by the + node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. Defaults to /dev/termination-log. + Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message should be populated. + File will use the contents of terminationMessagePath to populate + the container status message on both success and failure. 
FallbackToLogsOnError + will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever + is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate a TTY for + itself, also requires 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block devices to be + used by the container. This is an alpha feature and may change + in the future. + items: + description: volumeDevice describes a mapping of a raw block + device within a container. + properties: + devicePath: + description: devicePath is the path inside of the container + that the device will be mapped to. + type: string + name: + description: name must match the name of a persistentVolumeClaim + in the pod + type: string + required: + - name + - devicePath + type: array + volumeMounts: + description: Pod volumes to mount into the container's filesystem. + Cannot be updated. + items: + description: VolumeMount describes a mounting of a Volume within + a container. + properties: + mountPath: + description: Path within the container at which the volume + should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are + propagated from the host to container and the other way + around. When not set, MountPropagationHostToContainer + is used. This field is alpha in 1.8 and can be reworked + or removed in a future release. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise + (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's + volume should be mounted. Defaults to "" (volume's root). 
+ type: string + required: + - name + - mountPath + type: array + workingDir: + description: Container's working directory. If not specified, + the container runtime's default will be used, which might be + configured in the container image. Cannot be updated. + type: string + required: + - name + type: array + externalUrl: + description: The external URL the Alertmanager instances will be available + under. This is necessary to generate correct URLs. This is necessary + if Alertmanager is not served from root of a DNS name. + type: string + imagePullSecrets: + description: An optional list of references to secrets in the same namespace + to use for pulling prometheus and alertmanager images from registries + see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod + items: + description: LocalObjectReference contains enough information to let + you locate the referenced object inside the same namespace. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + type: array + listenLocal: + description: ListenLocal makes the Alertmanager server listen on loopback, + so that it does not bind against the Pod IP. Note this is only for + the Alertmanager UI, not the gossip communication. + type: boolean + logLevel: + description: Log level for Alertmanager to be configured with. + type: string + nodeSelector: + description: Define which Nodes the Pods are scheduled on. + type: object + paused: + description: If set to true all actions on the underlaying managed objects + are not goint to be performed, except for delete actions. + type: boolean + podMetadata: + description: ObjectMeta is metadata that all persisted resources must + have, which includes all objects users must create. 
+ properties: + annotations: + description: 'Annotations is an unstructured key value map stored + with a resource that may be set by external tools to store and + retrieve arbitrary metadata. They are not queryable and should + be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs to. + This is used to distinguish resources with same name and namespace + in different clusters. This field is not set anywhere right now + and apiserver is going to ignore it if set in create or update + request. + type: string + creationTimestamp: + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to gracefully + terminate before it will be removed from the system. Only set + when deletionTimestamp is also set. May only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted from the + registry. Each entry is an identifier for the responsible component + that will remove the entry from the list. If the deletionTimestamp + of the object is non-nil, entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. 
+ + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation + of the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that must execute + in order before this object is visible. When the last pending + initializer is removed, and no failing result is set, the + initializers struct will be set to nil and the object is considered + as initialized and visible to all clients. + items: + description: Initializer is information about an initializer + that has not yet completed. + properties: + name: + description: name of the process that is responsible for + initializing this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that don't return + other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of + this representation of an object. Servers should convert + recognized schemas to the latest internal value, and may + reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this status, + 0 if not set. 
+ format: int32 + type: integer + details: + description: StatusDetails is a set of additional properties + that MAY be set by the server to provide additional information + about a response. The Reason field of a Status object + defines what attributes will be set. Clients must ignore + fields that do not match the defined type of each attribute, + and should assume that any attribute may be empty, invalid, + or under defined. + properties: + causes: + description: The Causes array includes more details + associated with the StatusReason failure. Not all + StatusReasons may provide detailed causes. + items: + description: StatusCause provides more information + about an api.Status failure, including cases when + multiple errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description of the + cause of the error. This field may be presented + as-is to a reader. + type: string + reason: + description: A machine-readable description of + the cause of the error. If this value is empty + there is no information available. + type: string + type: array + group: + description: The group attribute of the resource associated + with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource associated + with the status StatusReason. On some operations may + differ from the requested resource Kind. 
More info: + https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource associated + with the status StatusReason (when there is a single + name which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds before + the operation should be retried. Some errors may indicate + the client must take an alternate action - for those + errors this field may indicate how long to wait before + taking the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there is a + single resource which can be described). More info: + http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing the REST + resource this object represents. Servers may infer this + from the endpoint the client submits requests to. Cannot + be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the status + of this operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic + resources must have, including lists and various status + objects. A resource may have only one of {ObjectMeta, + ListMeta}. + properties: + continue: + description: continue may be set if the user set a limit + on the number of items returned, and indicates that + the server has more data available. The value is opaque + and may be used to issue another request to the endpoint + that served this list to retrieve the next set of + available objects. Continuing a list may not be possible + if the server configuration has changed or more than + a few minutes have passed. The resourceVersion field + returned when using this continue value will be identical + to the value in the first response. 
+ type: string + resourceVersion: + description: 'String that identifies the server''s internal + version of this object that can be used by clients + to determine when objects have changed. Value must + be treated as opaque by clients and passed unmodified + back to the server. Populated by the system. Read-only. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why this + operation is in the "Failure" status. If this value is + empty there is no information available. A Reason clarifies + an HTTP status code but does not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" + or "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be used to + organize and categorize (scope and select) objects. May match + selectors of replication controllers and services. More info: + http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is required + when creating resources, although some resources may allow a client + to request the generation of an appropriate name automatically. + Name is primarily intended for creation idempotence and configuration + definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. 
Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If ALL objects + in the list have been deleted, this object will be garbage collected. + If this object is managed by a controller, then an entry in this + list will point to this controller, with the controller field + set to true. There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information to let + you identify an owning object. Currently, an owning object must + be in the same namespace, so there is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from the key-value + store until this reference is removed. Defaults to false. + To set this field, a user needs "delete" permission of the + owner, otherwise 422 (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the managing + controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. 
May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and pass them unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. Populated + by the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + replicas: + description: Size is the expected size of the alertmanager cluster. + The controller will eventually make the size of the running cluster + equal to the expected size. + format: int32 + type: integer + resources: + description: ResourceRequirements describes the compute resource requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute resources + allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute resources + required. If Requests is omitted for a container, it defaults + to Limits if that is explicitly specified, otherwise to an implementation-defined + value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + routePrefix: + description: The route prefix Alertmanager registers HTTP handlers for. 
+ This is useful, if using ExternalURL and a proxy is rewriting HTTP + routes of a request, and the actual ExternalURL is still true, but + the server serves requests under a different route prefix. For example + for use with `kubectl proxy`. + type: string + secrets: + description: Secrets is a list of Secrets in the same namespace as the + Alertmanager object, which shall be mounted into the Alertmanager + Pods. The Secrets are mounted into /etc/alertmanager/secrets/. + items: + type: string + type: array + securityContext: + description: PodSecurityContext holds pod-level security attributes + and common container settings. Some fields are also present in container.securityContext. Field + values of container.securityContext take precedence over field values + of PodSecurityContext. + properties: + fsGroup: + description: |- + A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod: + + 1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw---- + + If unset, the Kubelet will not modify the ownership and permissions of any volume. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root + user. If true, the Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 (root) and fail to start + the container if it does. If unset or false, no such validation + will be performed. May also be set in SecurityContext. If set + in both SecurityContext and PodSecurityContext, the value specified + in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. May + also be set in SecurityContext. 
If set in both SecurityContext + and PodSecurityContext, the value specified in SecurityContext + takes precedence for that container. + format: int64 + type: integer + seLinuxOptions: + description: SELinuxOptions are the labels to be applied to the + container + properties: + level: + description: Level is a SELinux level label that applies to the + container. + type: string + role: + description: Role is a SELinux role label that applies to the + container. + type: string + type: + description: Type is a SELinux type label that applies to the + container. + type: string + user: + description: User is a SELinux user label that applies to the + container. + type: string + supplementalGroups: + description: A list of groups applied to the first process run in + each container, in addition to the container's primary GID. If + unspecified, no groups will be added to any container. + items: + format: int64 + type: integer + type: array + serviceAccountName: + description: ServiceAccountName is the name of the ServiceAccount to + use to run the Alertmanager Pods. + type: string + storage: + description: StorageSpec defines the configured storage for a group + of Prometheus servers. + properties: + class: + description: 'Name of the StorageClass to use when requesting storage + provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses + DEPRECATED' + type: string + emptyDir: + description: Represents an empty directory for a pod. Empty directory + volumes support ownership management and SELinux relabeling. + properties: + medium: + description: 'What type of storage medium should back this directory. + The default is "" which means to use the node''s default medium. + Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + type: string + sizeLimit: {} + resources: + description: ResourceRequirements describes the compute resource + requirements. 
+ properties: + limits: + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + selector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the + key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. + If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". 
The requirements + are ANDed. + type: object + volumeClaimTemplate: + description: PersistentVolumeClaim is a user's request for and claim + to a persistent volume + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this + representation of an object. Servers should convert recognized + schemas to the latest internal value, and may reject unrecognized + values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource + this object represents. Servers may infer this from the endpoint + the client submits requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + metadata: + description: ObjectMeta is metadata that all persisted resources + must have, which includes all objects users must create. + properties: + annotations: + description: 'Annotations is an unstructured key value map + stored with a resource that may be set by external tools + to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs + to. This is used to distinguish resources with same name + and namespace in different clusters. This field is not + set anywhere right now and apiserver is going to ignore + it if set in create or update request. + type: string + creationTimestamp: + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to + gracefully terminate before it will be removed from the + system. Only set when deletionTimestamp is also set. May + only be shortened. Read-only. 
+ format: int64 + type: integer + deletionTimestamp: + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted + from the registry. Each entry is an identifier for the + responsible component that will remove the entry from + the list. If the deletionTimestamp of the object is non-nil, + entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. + + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation + of the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that + must execute in order before this object is visible. + When the last pending initializer is removed, and + no failing result is set, the initializers struct + will be set to nil and the object is considered as + initialized and visible to all clients. 
+ items: + description: Initializer is information about an initializer + that has not yet completed. + properties: + name: + description: name of the process that is responsible + for initializing this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that + don't return other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema + of this representation of an object. Servers should + convert recognized schemas to the latest internal + value, and may reject unrecognized values. More + info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this + status, 0 if not set. + format: int32 + type: integer + details: + description: StatusDetails is a set of additional + properties that MAY be set by the server to provide + additional information about a response. The Reason + field of a Status object defines what attributes + will be set. Clients must ignore fields that do + not match the defined type of each attribute, + and should assume that any attribute may be empty, + invalid, or under defined. + properties: + causes: + description: The Causes array includes more + details associated with the StatusReason failure. + Not all StatusReasons may provide detailed + causes. + items: + description: StatusCause provides more information + about an api.Status failure, including cases + when multiple errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. 
+ + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description + of the cause of the error. This field + may be presented as-is to a reader. + type: string + reason: + description: A machine-readable description + of the cause of the error. If this value + is empty there is no information available. + type: string + type: array + group: + description: The group attribute of the resource + associated with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource + associated with the status StatusReason. On + some operations may differ from the requested + resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource + associated with the status StatusReason (when + there is a single name which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds + before the operation should be retried. Some + errors may indicate the client must take an + alternate action - for those errors this field + may indicate how long to wait before taking + the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there + is a single resource which can be described). + More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing + the REST resource this object represents. Servers + may infer this from the endpoint the client submits + requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the + status of this operation. 
+ type: string + metadata: + description: ListMeta describes metadata that synthetic + resources must have, including lists and various + status objects. A resource may have only one of + {ObjectMeta, ListMeta}. + properties: + continue: + description: continue may be set if the user + set a limit on the number of items returned, + and indicates that the server has more data + available. The value is opaque and may be + used to issue another request to the endpoint + that served this list to retrieve the next + set of available objects. Continuing a list + may not be possible if the server configuration + has changed or more than a few minutes have + passed. The resourceVersion field returned + when using this continue value will be identical + to the value in the first response. + type: string + resourceVersion: + description: 'String that identifies the server''s + internal version of this object that can be + used by clients to determine when objects + have changed. Value must be treated as opaque + by clients and passed unmodified back to the + server. Populated by the system. Read-only. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing + this object. Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why + this operation is in the "Failure" status. If + this value is empty there is no information available. + A Reason clarifies an HTTP status code but does + not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" + or "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be + used to organize and categorize (scope and select) objects. 
+ May match selectors of replication controllers and services. + More info: http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is + required when creating resources, although some resources + may allow a client to request the generation of an appropriate + name automatically. Name is primarily intended for creation + idempotence and configuration definition. Cannot be updated. + More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If + ALL objects in the list have been deleted, this object + will be garbage collected. If this object is managed by + a controller, then an entry in this list will point to + this controller, with the controller field set to true. + There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information + to let you identify an owning object. Currently, an + owning object must be in the same namespace, so there + is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from + the key-value store until this reference is removed. + Defaults to false. To set this field, a user needs + "delete" permission of the owner, otherwise 422 + (Unprocessable Entity) will be returned. 
+ type: boolean + controller: + description: If true, this reference points to the + managing controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and pass them unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + spec: + description: PersistentVolumeClaimSpec describes the common + attributes of storage devices and allows a Source for provider-specific + attributes + properties: + accessModes: + description: 'AccessModes contains the desired access modes + the volume should have. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + resources: + description: ResourceRequirements describes the compute + resource requirements. + properties: + limits: + description: 'Limits describes the maximum amount of + compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount + of compute resources required. If Requests is omitted + for a container, it defaults to Limits if that is + explicitly specified, otherwise to an implementation-defined + value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + selector: + description: A label selector is a label query over a set + of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. 
+ items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + storageClassName: + description: 'Name of the StorageClass required by the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines what type of volume is required + by the claim. Value of Filesystem is implied when not + included in claim spec. This is an alpha feature and may + change in the future. + type: string + volumeName: + description: VolumeName is the binding reference to the + PersistentVolume backing this claim. + type: string + status: + description: PersistentVolumeClaimStatus is the current status + of a persistent volume claim. + properties: + accessModes: + description: 'AccessModes contains the actual access modes + the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + capacity: + description: Represents the actual resources of the underlying + volume. + type: object + conditions: + description: Current Condition of persistent volume claim. + If underlying persistent volume is being resized then + the Condition will be set to 'ResizeStarted'. + items: + description: PersistentVolumeClaimCondition contains details + about state of pvc + properties: + lastProbeTime: + format: date-time + type: string + lastTransitionTime: + format: date-time + type: string + message: + description: Human-readable message indicating details + about last transition.
+ type: string + reason: + description: Unique, this should be a short, machine + understandable string that gives the reason for + condition's last transition. If it reports "ResizeStarted" + that means the underlying persistent volume is being + resized. + type: string + status: + type: string + type: + type: string + required: + - type + - status + type: array + phase: + description: Phase represents the current phase of PersistentVolumeClaim. + type: string + tolerations: + description: If specified, the pod's tolerations. + items: + description: The pod this Toleration is attached to tolerates any + taint that matches the triple <key,value,effect> using the matching + operator <operator>. + properties: + effect: + description: Effect indicates the taint effect to match. Empty + means match all taint effects. When specified, allowed values + are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Key is the taint key that the toleration applies + to. Empty means match all taint keys. If the key is empty, operator + must be Exists; this combination means to match all values and + all keys. + type: string + operator: + description: Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. Exists + is equivalent to wildcard for value, so that a pod can tolerate + all taints of a particular category. + type: string + tolerationSeconds: + description: TolerationSeconds represents the period of time the + toleration (which must be of effect NoExecute, otherwise this + field is ignored) tolerates the taint. By default, it is not + set, which means tolerate the taint forever (do not evict). + Zero and negative values will be treated as 0 (evict immediately) + by the system. + format: int64 + type: integer + value: + description: Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise + just a regular string.
+ type: string + type: array + version: + description: Version the cluster should be on. + type: string + status: + description: 'Most recent observed status of the Alertmanager cluster. Read-only. + Not included when requesting from the apiserver, only from the Prometheus + Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + properties: + availableReplicas: + description: Total number of available pods (ready for at least minReadySeconds) + targeted by this Alertmanager cluster. + format: int32 + type: integer + paused: + description: Represents whether any actions on the underlying managed + objects are being performed. Only delete actions will be performed. + type: boolean + replicas: + description: Total number of non-terminated pods targeted by this Alertmanager + cluster (their labels match the selector). + format: int32 + type: integer + unavailableReplicas: + description: Total number of unavailable pods targeted by this Alertmanager + cluster. + format: int32 + type: integer + updatedReplicas: + description: Total number of non-terminated pods targeted by this Alertmanager + cluster that have the desired version spec.
+ format: int32 + type: integer + required: + - paused + - replicas + - updatedReplicas + - availableReplicas + - unavailableReplicas + required: + - spec + version: v1 +status: + acceptedNames: + kind: "" + plural: "" + conditions: null diff --git a/manifests/0prometheus-operator-0prometheus-custom-resource-definition.yaml b/manifests/0prometheus-operator-0prometheus-custom-resource-definition.yaml new file mode 100644 index 00000000..e30f5bb2 --- /dev/null +++ b/manifests/0prometheus-operator-0prometheus-custom-resource-definition.yaml @@ -0,0 +1,2688 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: prometheuses.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: Prometheus + plural: prometheuses + scope: Namespaced + validation: + openAPIV3Schema: + description: Prometheus defines a Prometheus deployment. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + spec: + description: 'Specification of the desired behavior of the Prometheus cluster. + More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + properties: + additionalScrapeConfigs: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be a valid + secret key. 
+ type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be defined + type: boolean + required: + - key + affinity: + description: Affinity is a group of affinity scheduling rules. + properties: + nodeAffinity: + description: Node affinity is a group of node affinity scheduling + rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the affinity expressions specified by this field, + but it may choose a node that violates one or more of the + expressions. The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node that meets + all of the scheduling requirements (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by iterating through + the elements of this field and adding "weight" to the sum + if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. + items: + description: An empty preferred scheduling term matches all + objects with implicit weight 0 (i.e. it's a no-op). A null + preferred scheduling term matches no objects (i.e. is also + a no-op). + properties: + preference: + description: A null or empty node selector term matches + no objects. + properties: + matchExpressions: + description: Required. A list of node selector requirements. + The requirements are ANDed. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. 
+ type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + required: + - matchExpressions + weight: + description: Weight associated with matching the corresponding + nodeSelectorTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - preference + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: A node selector represents the union of the results + of one or more label queries over a set of nodes; that is, + it represents the OR of the selectors represented by the node + selector terms. + properties: + nodeSelectorTerms: + description: Required. A list of node selector terms. The + terms are ORed. + items: + description: A null or empty node selector term matches + no objects. + properties: + matchExpressions: + description: Required. A list of node selector requirements. + The requirements are ANDed. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. 
If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + required: + - matchExpressions + type: array + required: + - nodeSelectorTerms + podAffinity: + description: Pod affinity is a group of inter pod affinity scheduling + rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the affinity expressions specified by this field, + but it may choose a node that violates one or more of the + expressions. The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node that meets + all of the scheduling requirements (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by iterating through + the elements of this field and adding "weight" to the sum + if the node has pods which matches the corresponding podAffinityTerm; + the node(s) with the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm + fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) + that this pod should be co-located (affinity) or not + co-located (anti-affinity) with, where co-located is + defined as running on a node whose value of the label + with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector + matches all objects. A null label selector matches + no objects. 
+ properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement is + a selector that contains values, a key, and + an operator that relates the key and values. + properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If + the operator is Exists or DoesNotExist, + the values array must be empty. This array + is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". + The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces + the labelSelector applies to (matches against); + null or empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods + matching the labelSelector in the specified namespaces, + where co-located is defined as running on a node + whose value of the label with key topologyKey matches + that of any node on which any of the selected pods + is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + weight: + description: weight associated with matching the corresponding + podAffinityTerm, in the range 1-100. 
+ format: int32 + type: integer + required: + - weight + - podAffinityTerm + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements specified by this + field are not met at scheduling time, the pod will not be + scheduled onto the node. If the affinity requirements specified + by this field cease to be met at some point during pod execution + (e.g. due to a pod label update), the system may or may not + try to eventually evict the pod from its node. When there + are multiple elements, the lists of nodes corresponding to + each podAffinityTerm are intersected, i.e. all terms must + be satisfied. + items: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) that + this pod should be co-located (affinity) or not co-located + (anti-affinity) with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on which a pod of the set of pods + is running + properties: + labelSelector: + description: A label selector is a label query over a + set of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. 
If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces the + labelSelector applies to (matches against); null or + empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where + co-located is defined as running on a node whose value + of the label with key topologyKey matches that of any + node on which any of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: array + podAntiAffinity: + description: Pod anti affinity is a group of inter pod anti affinity + scheduling rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the anti-affinity expressions specified by this + field, but it may choose a node that violates one or more + of the expressions. The node that is most preferred is the + one with the greatest sum of weights, i.e. 
for each node that + meets all of the scheduling requirements (resource request, + requiredDuringScheduling anti-affinity expressions, etc.), + compute a sum by iterating through the elements of this field + and adding "weight" to the sum if the node has pods which + matches the corresponding podAffinityTerm; the node(s) with + the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm + fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) + that this pod should be co-located (affinity) or not + co-located (anti-affinity) with, where co-located is + defined as running on a node whose value of the label + with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector + matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement is + a selector that contains values, a key, and + an operator that relates the key and values. + properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If + the operator is Exists or DoesNotExist, + the values array must be empty. This array + is replaced during a strategic merge patch. 
+ items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". + The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces + the labelSelector applies to (matches against); + null or empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods + matching the labelSelector in the specified namespaces, + where co-located is defined as running on a node + whose value of the label with key topologyKey matches + that of any node on which any of the selected pods + is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + weight: + description: weight associated with matching the corresponding + podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - podAffinityTerm + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the anti-affinity requirements specified by + this field are not met at scheduling time, the pod will not + be scheduled onto the node. If the anti-affinity requirements + specified by this field cease to be met at some point during + pod execution (e.g. due to a pod label update), the system + may or may not try to eventually evict the pod from its node. + When there are multiple elements, the lists of nodes corresponding + to each podAffinityTerm are intersected, i.e. all terms must + be satisfied. 
+ items: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) that + this pod should be co-located (affinity) or not co-located + (anti-affinity) with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on which a pod of the set of pods + is running + properties: + labelSelector: + description: A label selector is a label query over a + set of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. 
+ type: object + namespaces: + description: namespaces specifies which namespaces the + labelSelector applies to (matches against); null or + empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where + co-located is defined as running on a node whose value + of the label with key topologyKey matches that of any + node on which any of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: array + alerting: + description: AlertingSpec defines parameters for alerting configuration + of Prometheus servers. + properties: + alertmanagers: + description: AlertmanagerEndpoints Prometheus should fire alerts + against. + items: + description: AlertmanagerEndpoints defines a selection of a single + Endpoints object containing alertmanager IPs to fire alerts + against. + properties: + bearerTokenFile: + description: BearerTokenFile to read from filesystem to use + when authenticating to Alertmanager. + type: string + name: + description: Name of Endpoints object in Namespace. + type: string + namespace: + description: Namespace of Endpoints object. + type: string + pathPrefix: + description: Prefix for the HTTP path alerts are pushed to. + type: string + port: {} + scheme: + description: Scheme to use when firing alerts. + type: string + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. 
+ type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + required: + - namespace + - name + - port + type: array + required: + - alertmanagers + baseImage: + description: Base image to use for a Prometheus deployment. + type: string + containers: + description: Containers allows injecting additional containers. This + is meant to allow adding an authentication proxy to a Prometheus pod. + items: + description: A single application container that you want to run within + a pod. + properties: + args: + description: 'Arguments to the entrypoint. The docker image''s + CMD is used if this is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. If a variable + cannot be resolved, the reference in the input string will be + unchanged. The $(VAR_NAME) syntax can be escaped with a double + $$, ie: $$(VAR_NAME). Escaped references will never be expanded, + regardless of whether the variable exists or not. Cannot be + updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within a shell. The + docker image''s ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container''s + environment. If a variable cannot be resolved, the reference + in the input string will be unchanged. The $(VAR_NAME) syntax + can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references + will never be expanded, regardless of whether the variable exists + or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set in the container. + Cannot be updated. 
+ items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. Must be a + C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are expanded + using the previous defined environment variables in the + container and any service environment variables. If a + variable cannot be resolved, the reference in the input + string will be unchanged. The $(VAR_NAME) syntax can be + escaped with a double $$, ie: $$(VAR_NAME). Escaped references + will never be expanded, regardless of whether the variable + exists or not. Defaults to "".' + type: string + valueFrom: + description: EnvVarSource represents a source for the value + of an EnvVar. + properties: + configMapKeyRef: + description: Selects a key from a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the ConfigMap or it's + key must be defined + type: boolean + required: + - key + fieldRef: + description: ObjectFieldSelector selects an APIVersioned + field of an object. + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the + specified API version. + type: string + required: + - fieldPath + resourceFieldRef: + description: ResourceFieldSelector represents container + resources (cpu, memory) and their output format + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: {} + resource: + description: 'Required: resource to select' + type: string + required: + - resource + secretKeyRef: + description: SecretKeySelector selects a key of a Secret. 
+ properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's + key must be defined + type: boolean + required: + - key + required: + - name + type: array + envFrom: + description: List of sources to populate environment variables + in the container. The keys defined within a source must be a + C_IDENTIFIER. All invalid keys will be reported as an event + when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take + precedence. Values defined by an Env with a duplicate key will + take precedence. Cannot be updated. + items: + description: EnvFromSource represents the source of a set of + ConfigMaps + properties: + configMapRef: + description: |- + ConfigMapEnvSource selects a ConfigMap to populate the environment variables with. + + The contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the ConfigMap must be defined + type: boolean + prefix: + description: An optional identifer to prepend to each key + in the ConfigMap. Must be a C_IDENTIFIER. + type: string + secretRef: + description: |- + SecretEnvSource selects a Secret to populate the environment variables with. + + The contents of the target Secret's Data field will represent the key-value pairs as environment variables. + properties: + name: + description: 'Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret must be defined + type: boolean + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management + to default or override container images in workload controllers + like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent + otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Lifecycle describes actions that the management system + should take in response to container lifecycle events. For the + PostStart and PreStop lifecycle handlers, management of the + container blocks until the action is complete, unless the container + process fails, in which case the handler is aborted. + properties: + postStart: + description: Handler defines a specific action that should + be taken + properties: + exec: + description: ExecAction describes a "run in container" + action. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', + etc) won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + httpGet: + description: HTTPGetAction describes an action based on + HTTP Get requests. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. 
You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: {} + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + tcpSocket: + description: TCPSocketAction describes an action based + on opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: {} + required: + - port + preStop: + description: Handler defines a specific action that should + be taken + properties: + exec: + description: ExecAction describes a "run in container" + action. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', + etc) won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + httpGet: + description: HTTPGetAction describes an action based on + HTTP Get requests. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. 
+ items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: {} + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + tcpSocket: + description: TCPSocketAction describes an action based + on opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: {} + required: + - port + livenessProbe: + description: Probe describes a health check to be performed against + a container to determine whether it is alive or ready to receive + traffic. + properties: + exec: + description: ExecAction describes a "run in container" action. + properties: + command: + description: Command is the command line to execute inside + the container, the working directory for the command is + root ('/') in the container's filesystem. The command + is simply exec'd, it is not run inside a shell, so traditional + shell instructions ('|', etc) won't work. To use a shell, + you need to explicitly call out to that shell. Exit + status of 0 is treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + failureThreshold: + description: Minimum consecutive failures for the probe to + be considered failed after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGetAction describes an action based on HTTP + Get requests. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. 
HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header to + be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: {} + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + initialDelaySeconds: + description: 'Number of seconds after the container has started + before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to + be considered successful after having failed. Defaults to + 1. Must be 1 for liveness. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocketAction describes an action based on + opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: {} + required: + - port + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + name: + description: Name of the container specified as a DNS_LABEL. Each + container in a pod must have a unique name (DNS_LABEL). Cannot + be updated. + type: string + ports: + description: List of ports to expose from the container. 
Exposing + a port here gives the system additional information about the + network connections a container uses, but is primarily informational. + Not specifying a port here DOES NOT prevent that port from being + exposed. Any port which is listening on the default "0.0.0.0" + address inside a container will be accessible from the network. + Cannot be updated. + items: + description: ContainerPort represents a network port in a single + container. + properties: + containerPort: + description: Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external port to. + type: string + hostPort: + description: Number of port to expose on the host. If specified, + this must be a valid port number, 0 < x < 65536. If HostNetwork + is specified, this must match ContainerPort. Most containers + do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an IANA_SVC_NAME + and unique within the pod. Each named port in a pod must + have a unique name. Name for the port that can be referred + to by services. + type: string + protocol: + description: Protocol for port. Must be UDP or TCP. Defaults + to "TCP". + type: string + required: + - containerPort + type: array + readinessProbe: + description: Probe describes a health check to be performed against + a container to determine whether it is alive or ready to receive + traffic. + properties: + exec: + description: ExecAction describes a "run in container" action. + properties: + command: + description: Command is the command line to execute inside + the container, the working directory for the command is + root ('/') in the container's filesystem. The command + is simply exec'd, it is not run inside a shell, so traditional + shell instructions ('|', etc) won't work. To use a shell, + you need to explicitly call out to that shell. 
Exit + status of 0 is treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + failureThreshold: + description: Minimum consecutive failures for the probe to + be considered failed after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGetAction describes an action based on HTTP + Get requests. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header to + be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: {} + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + initialDelaySeconds: + description: 'Number of seconds after the container has started + before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to + be considered successful after having failed. Defaults to + 1. Must be 1 for liveness. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocketAction describes an action based on + opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' 
+ type: string + port: {} + required: + - port + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + securityContext: + description: SecurityContext holds security configuration that + will be applied to a container. Some fields are present in both + SecurityContext and PodSecurityContext. When both are set, + the values in SecurityContext take precedence. + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls whether a + process can gain more privileges than its parent process. + This bool directly controls if the no_new_privs flag will + be set on the container process. AllowPrivilegeEscalation + is true always when the container is: 1) run as Privileged + 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: Adds and removes POSIX capabilities from running + containers. + properties: + add: + description: Added capabilities + items: + type: string + type: array + drop: + description: Removed capabilities + items: + type: string + type: array + privileged: + description: Run container in privileged mode. 
Processes in + privileged containers are essentially equivalent to root + on the host. Defaults to false. + type: boolean + readOnlyRootFilesystem: + description: Whether this container has a read-only root filesystem. + Default is false. + type: boolean + runAsNonRoot: + description: Indicates that the container must run as a non-root + user. If true, the Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 (root) and fail + to start the container if it does. If unset or false, no + such validation will be performed. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container + process. Defaults to user specified in image metadata if + unspecified. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: SELinuxOptions are the labels to be applied to + the container + properties: + level: + description: Level is SELinux level label that applies + to the container. + type: string + role: + description: Role is a SELinux role label that applies + to the container. + type: string + type: + description: Type is a SELinux type label that applies + to the container. + type: string + user: + description: User is a SELinux user label that applies + to the container. + type: string + stdin: + description: Whether this container should allocate a buffer for + stdin in the container runtime. If this is not set, reads from + stdin in the container will always result in EOF. Default is + false. + type: boolean + stdinOnce: + description: Whether the container runtime should close the stdin + channel after it has been opened by a single attach. 
When stdin + is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container + start, is empty until the first client attaches to stdin, and + then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container + is restarted. If this flag is false, a container process that + reads from stdin will never receive an EOF. Default is false. + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the file to which the container''s + termination message will be written is mounted into the container''s + filesystem. Message written is intended to be brief final status, + such as an assertion failure message. Will be truncated by the + node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. Defaults to /dev/termination-log. + Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message should be populated. + File will use the contents of terminationMessagePath to populate + the container status message on both success and failure. FallbackToLogsOnError + will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever + is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate a TTY for + itself, also requires 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block devices to be + used by the container. This is an alpha feature and may change + in the future. + items: + description: volumeDevice describes a mapping of a raw block + device within a container.
+ properties: + devicePath: + description: devicePath is the path inside of the container + that the device will be mapped to. + type: string + name: + description: name must match the name of a persistentVolumeClaim + in the pod + type: string + required: + - name + - devicePath + type: array + volumeMounts: + description: Pod volumes to mount into the container's filesystem. + Cannot be updated. + items: + description: VolumeMount describes a mounting of a Volume within + a container. + properties: + mountPath: + description: Path within the container at which the volume + should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are + propagated from the host to container and the other way + around. When not set, MountPropagationHostToContainer + is used. This field is alpha in 1.8 and can be reworked + or removed in a future release. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise + (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's + volume should be mounted. Defaults to "" (volume's root). + type: string + required: + - name + - mountPath + type: array + workingDir: + description: Container's working directory. If not specified, + the container runtime's default will be used, which might be + configured in the container image. Cannot be updated. + type: string + required: + - name + type: array + evaluationInterval: + description: Interval between consecutive evaluations. + type: string + externalLabels: + description: The labels to add to any time series or alerts when communicating + with external systems (federation, remote storage, Alertmanager). + type: object + externalUrl: + description: The external URL the Prometheus instances will be available + under. 
This is necessary to generate correct URLs. This is necessary + if Prometheus is not served from root of a DNS name. + type: string + imagePullSecrets: + description: An optional list of references to secrets in the same namespace + to use for pulling prometheus and alertmanager images from registries + see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod + items: + description: LocalObjectReference contains enough information to let + you locate the referenced object inside the same namespace. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + type: array + listenLocal: + description: ListenLocal makes the Prometheus server listen on loopback, + so that it does not bind against the Pod IP. + type: boolean + logLevel: + description: Log level for Prometheus to be configured with. + type: string + nodeSelector: + description: Define which Nodes the Pods are scheduled on. + type: object + paused: + description: When a Prometheus deployment is paused, no actions except + for deletion will be performed on the underlying objects. + type: boolean + podMetadata: + description: ObjectMeta is metadata that all persisted resources must + have, which includes all objects users must create. + properties: + annotations: + description: 'Annotations is an unstructured key value map stored + with a resource that may be set by external tools to store and + retrieve arbitrary metadata. They are not queryable and should + be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs to. + This is used to distinguish resources with same name and namespace + in different clusters. This field is not set anywhere right now + and apiserver is going to ignore it if set in create or update + request. 
+ type: string + creationTimestamp: + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to gracefully + terminate before it will be removed from the system. Only set + when deletionTimestamp is also set. May only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted from the + registry. Each entry is an identifier for the responsible component + that will remove the entry from the list. If the deletionTimestamp + of the object is non-nil, entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. + + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation + of the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. 
+ properties: + pending: + description: Pending is a list of initializers that must execute + in order before this object is visible. When the last pending + initializer is removed, and no failing result is set, the + initializers struct will be set to nil and the object is considered + as initialized and visible to all clients. + items: + description: Initializer is information about an initializer + that has not yet completed. + properties: + name: + description: name of the process that is responsible for + initializing this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that don't return + other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of + this representation of an object. Servers should convert + recognized schemas to the latest internal value, and may + reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this status, + 0 if not set. + format: int32 + type: integer + details: + description: StatusDetails is a set of additional properties + that MAY be set by the server to provide additional information + about a response. The Reason field of a Status object + defines what attributes will be set. Clients must ignore + fields that do not match the defined type of each attribute, + and should assume that any attribute may be empty, invalid, + or under defined. + properties: + causes: + description: The Causes array includes more details + associated with the StatusReason failure. Not all + StatusReasons may provide detailed causes. + items: + description: StatusCause provides more information + about an api.Status failure, including cases when + multiple errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. 
May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description of the + cause of the error. This field may be presented + as-is to a reader. + type: string + reason: + description: A machine-readable description of + the cause of the error. If this value is empty + there is no information available. + type: string + type: array + group: + description: The group attribute of the resource associated + with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource associated + with the status StatusReason. On some operations may + differ from the requested resource Kind. More info: + https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource associated + with the status StatusReason (when there is a single + name which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds before + the operation should be retried. Some errors may indicate + the client must take an alternate action - for those + errors this field may indicate how long to wait before + taking the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there is a + single resource which can be described). More info: + http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing the REST + resource this object represents. Servers may infer this + from the endpoint the client submits requests to. Cannot + be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the status + of this operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic + resources must have, including lists and various status + objects. A resource may have only one of {ObjectMeta, + ListMeta}. + properties: + continue: + description: continue may be set if the user set a limit + on the number of items returned, and indicates that + the server has more data available. The value is opaque + and may be used to issue another request to the endpoint + that served this list to retrieve the next set of + available objects. Continuing a list may not be possible + if the server configuration has changed or more than + a few minutes have passed. The resourceVersion field + returned when using this continue value will be identical + to the value in the first response. + type: string + resourceVersion: + description: 'String that identifies the server''s internal + version of this object that can be used by clients + to determine when objects have changed. Value must + be treated as opaque by clients and passed unmodified + back to the server. Populated by the system. Read-only. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why this + operation is in the "Failure" status. If this value is + empty there is no information available. A Reason clarifies + an HTTP status code but does not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" + or "Failure". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be used to + organize and categorize (scope and select) objects. May match + selectors of replication controllers and services. More info: + http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is required + when creating resources, although some resources may allow a client + to request the generation of an appropriate name automatically. + Name is primarily intended for creation idempotence and configuration + definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If ALL objects + in the list have been deleted, this object will be garbage collected. + If this object is managed by a controller, then an entry in this + list will point to this controller, with the controller field + set to true. There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information to let + you identify an owning object. Currently, an owning object must + be in the same namespace, so there is no namespace field. + properties: + apiVersion: + description: API version of the referent. 
+ type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from the key-value + store until this reference is removed. Defaults to false. + To set this field, a user needs "delete" permission of the + owner, otherwise 422 (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the managing + controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. Populated + by the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only.
More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + remoteRead: + description: If specified, the remote_read spec. This is an experimental + feature, it may change in any upcoming release in a breaking way. + items: + description: RemoteReadSpec defines the remote_read configuration + for prometheus. + properties: + basicAuth: + description: 'BasicAuth allows an endpoint to authenticate over + basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + bearerToken: + description: bearer token for remote read. + type: string + bearerTokenFile: + description: File to read bearer token for remote read. + type: string + proxyUrl: + description: Optional ProxyURL + type: string + readRecent: + description: Whether reads should be made for queries for time + ranges that the local storage should have complete data for. + type: boolean + remoteTimeout: + description: Timeout for requests to the remote read endpoint.
+ type: string + requiredMatchers: + description: An optional list of equality matchers which have + to be present in a selector to query the remote read endpoint. + type: object + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + url: + description: The URL of the endpoint to send samples to. + type: string + required: + - url + type: array + remoteWrite: + description: If specified, the remote_write spec. This is an experimental + feature, it may change in any upcoming release in a breaking way. + items: + description: RemoteWriteSpec defines the remote_write configuration + for prometheus. + properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over + basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bearerToken: + description: bearer token for remote write. + type: string + bearerTokenFile: + description: File to read bearer token for remote write. + type: string + proxyUrl: + description: Optional ProxyURL + type: string + remoteTimeout: + description: Timeout for requests to the remote write endpoint. + type: string + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + url: + description: The URL of the endpoint to send samples to. + type: string + writeRelabelConfigs: + description: The list of remote write relabel configurations. + items: + description: 'RelabelConfig allows dynamic rewriting of the + label set, being applied to samples before ingestion. It defines + `<metric_relabel_configs>`-section of Prometheus configuration. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. + Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label + values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched.
default is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace + is performed if the regular expression matches. Regex + capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source + label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing + labels. Their content is concatenated using the configured + separator and matched against the configured regular expression + for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written + in a replace action. It is mandatory for replace actions. + Regex capture groups are available. + type: string + type: array + required: + - url + type: array + replicas: + description: Number of instances to deploy for a Prometheus deployment. + format: int32 + type: integer + resources: + description: ResourceRequirements describes the compute resource requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute resources + allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute resources + required. If Requests is omitted for a container, it defaults + to Limits if that is explicitly specified, otherwise to an implementation-defined + value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + retention: + description: Time duration Prometheus shall retain data for. + type: string + routePrefix: + description: The route prefix Prometheus registers HTTP handlers for.
+ This is useful, if using ExternalURL and a proxy is rewriting HTTP + routes of a request, and the actual ExternalURL is still true, but + the server serves requests under a different route prefix. For example + for use with `kubectl proxy`. + type: string + ruleSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + scrapeInterval: + description: Interval between consecutive scrapes. + type: string + secrets: + description: Secrets is a list of Secrets in the same namespace as the + Prometheus object, which shall be mounted into the Prometheus Pods. 
+ The Secrets are mounted into /etc/prometheus/secrets/<secret-name>. + Secrets changes after initial creation of a Prometheus object are + not reflected in the running Pods. To change the secrets mounted into + the Prometheus Pods, the object must be deleted and recreated with + the new list of secrets. + items: + type: string + type: array + securityContext: + description: PodSecurityContext holds pod-level security attributes + and common container settings. Some fields are also present in container.securityContext. Field + values of container.securityContext take precedence over field values + of PodSecurityContext. + properties: + fsGroup: + description: |- + A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod: + + 1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw---- + + If unset, the Kubelet will not modify the ownership and permissions of any volume. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root + user. If true, the Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 (root) and fail to start + the container if it does. If unset or false, no such validation + will be performed. May also be set in SecurityContext. If set + in both SecurityContext and PodSecurityContext, the value specified + in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. May + also be set in SecurityContext. If set in both SecurityContext + and PodSecurityContext, the value specified in SecurityContext + takes precedence for that container.
+ format: int64 + type: integer + seLinuxOptions: + description: SELinuxOptions are the labels to be applied to the + container + properties: + level: + description: Level is SELinux level label that applies to the + container. + type: string + role: + description: Role is a SELinux role label that applies to the + container. + type: string + type: + description: Type is a SELinux type label that applies to the + container. + type: string + user: + description: User is a SELinux user label that applies to the + container. + type: string + supplementalGroups: + description: A list of groups applied to the first process run in + each container, in addition to the container's primary GID. If + unspecified, no groups will be added to any container. + items: + format: int64 + type: integer + type: array + serviceAccountName: + description: ServiceAccountName is the name of the ServiceAccount to + use to run the Prometheus Pods. + type: string + serviceMonitorNamespaceSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. 
This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + serviceMonitorSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. 
+ type: object + storage: + description: StorageSpec defines the configured storage for a group + Prometheus servers. + properties: + class: + description: 'Name of the StorageClass to use when requesting storage + provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses + DEPRECATED' + type: string + emptyDir: + description: Represents an empty directory for a pod. Empty directory + volumes support ownership management and SELinux relabeling. + properties: + medium: + description: 'What type of storage medium should back this directory. + The default is "" which means to use the node''s default medium. + Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + type: string + sizeLimit: {} + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + selector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. 
+ items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the + key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. + If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + volumeClaimTemplate: + description: PersistentVolumeClaim is a user's request for and claim + to a persistent volume + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this + representation of an object. Servers should convert recognized + schemas to the latest internal value, and may reject unrecognized + values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource + this object represents. Servers may infer this from the endpoint + the client submits requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + metadata: + description: ObjectMeta is metadata that all persisted resources + must have, which includes all objects users must create. 
+ properties: + annotations: + description: 'Annotations is an unstructured key value map + stored with a resource that may be set by external tools + to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs + to. This is used to distinguish resources with same name + and namespace in different clusters. This field is not + set anywhere right now and apiserver is going to ignore + it if set in create or update request. + type: string + creationTimestamp: + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to + gracefully terminate before it will be removed from the + system. Only set when deletionTimestamp is also set. May + only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted + from the registry. Each entry is an identifier for the + responsible component that will remove the entry from + the list. If the deletionTimestamp of the object is non-nil, + entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. 
+ + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation + of the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that + must execute in order before this object is visible. + When the last pending initializer is removed, and + no failing result is set, the initializers struct + will be set to nil and the object is considered as + initialized and visible to all clients. + items: + description: Initializer is information about an initializer + that has not yet completed. + properties: + name: + description: name of the process that is responsible + for initializing this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that + don't return other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema + of this representation of an object. Servers should + convert recognized schemas to the latest internal + value, and may reject unrecognized values. More + info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this + status, 0 if not set. 
+ format: int32 + type: integer + details: + description: StatusDetails is a set of additional + properties that MAY be set by the server to provide + additional information about a response. The Reason + field of a Status object defines what attributes + will be set. Clients must ignore fields that do + not match the defined type of each attribute, + and should assume that any attribute may be empty, + invalid, or under defined. + properties: + causes: + description: The Causes array includes more + details associated with the StatusReason failure. + Not all StatusReasons may provide detailed + causes. + items: + description: StatusCause provides more information + about an api.Status failure, including cases + when multiple errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description + of the cause of the error. This field + may be presented as-is to a reader. + type: string + reason: + description: A machine-readable description + of the cause of the error. If this value + is empty there is no information available. + type: string + type: array + group: + description: The group attribute of the resource + associated with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource + associated with the status StatusReason. On + some operations may differ from the requested + resource Kind. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource + associated with the status StatusReason (when + there is a single name which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds + before the operation should be retried. Some + errors may indicate the client must take an + alternate action - for those errors this field + may indicate how long to wait before taking + the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there + is a single resource which can be described). + More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing + the REST resource this object represents. Servers + may infer this from the endpoint the client submits + requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the + status of this operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic + resources must have, including lists and various + status objects. A resource may have only one of + {ObjectMeta, ListMeta}. + properties: + continue: + description: continue may be set if the user + set a limit on the number of items returned, + and indicates that the server has more data + available. The value is opaque and may be + used to issue another request to the endpoint + that served this list to retrieve the next + set of available objects. Continuing a list + may not be possible if the server configuration + has changed or more than a few minutes have + passed. The resourceVersion field returned + when using this continue value will be identical + to the value in the first response. 
+ type: string + resourceVersion: + description: 'String that identifies the server''s + internal version of this object that can be + used by clients to determine when objects + have changed. Value must be treated as opaque + by clients and passed unmodified back to the + server. Populated by the system. Read-only. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing + this object. Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why + this operation is in the "Failure" status. If + this value is empty there is no information available. + A Reason clarifies an HTTP status code but does + not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" + or "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be + used to organize and categorize (scope and select) objects. + May match selectors of replication controllers and services. + More info: http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is + required when creating resources, although some resources + may allow a client to request the generation of an appropriate + name automatically. Name is primarily intended for creation + idempotence and configuration definition. Cannot be updated. + More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. 
Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If + ALL objects in the list have been deleted, this object + will be garbage collected. If this object is managed by + a controller, then an entry in this list will point to + this controller, with the controller field set to true. + There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information + to let you identify an owning object. Currently, an + owning object must be in the same namespace, so there + is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from + the key-value store until this reference is removed. + Defaults to false. To set this field, a user needs + "delete" permission of the owner, otherwise 422 + (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the + managing controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. 
May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + spec: + description: PersistentVolumeClaimSpec describes the common + attributes of storage devices and allows a Source for provider-specific + attributes + properties: + accessModes: + description: 'AccessModes contains the desired access modes + the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + resources: + description: ResourceRequirements describes the compute + resource requirements. + properties: + limits: + description: 'Limits describes the maximum amount of + compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount + of compute resources required. If Requests is omitted + for a container, it defaults to Limits if that is + explicitly specified, otherwise to an implementation-defined + value. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + selector: + description: A label selector is a label query over a set + of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + storageClassName: + description: 'Name of the StorageClass required by the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines what type of volume is required + by the claim. Value of Filesystem is implied when not + included in claim spec. This is an alpha feature and may + change in the future. 
+ type: string + volumeName: + description: VolumeName is the binding reference to the + PersistentVolume backing this claim. + type: string + status: + description: PersistentVolumeClaimStatus is the current status + of a persistent volume claim. + properties: + accessModes: + description: 'AccessModes contains the actual access modes + the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + capacity: + description: Represents the actual resources of the underlying + volume. + type: object + conditions: + description: Current Condition of persistent volume claim. + If underlying persistent volume is being resized then + the Condition will be set to 'ResizeStarted'. + items: + description: PersistentVolumeClaimCondition contails details + about state of pvc + properties: + lastProbeTime: + format: date-time + type: string + lastTransitionTime: + format: date-time + type: string + message: + description: Human-readable message indicating details + about last transition. + type: string + reason: + description: Unique, this should be a short, machine + understandable string that gives the reason for + condition's last transition. If it reports "ResizeStarted" + that means the underlying persistent volume is being + resized. + type: string + status: + type: string + type: + type: string + required: + - type + - status + type: array + phase: + description: Phase represents the current phase of PersistentVolumeClaim. + type: string + tolerations: + description: If specified, the pod's tolerations. + items: + description: The pod this Toleration is attached to tolerates any + taint that matches the triple using the matching + operator . + properties: + effect: + description: Effect indicates the taint effect to match. Empty + means match all taint effects. When specified, allowed values + are NoSchedule, PreferNoSchedule and NoExecute. 
+ type: string + key: + description: Key is the taint key that the toleration applies + to. Empty means match all taint keys. If the key is empty, operator + must be Exists; this combination means to match all values and + all keys. + type: string + operator: + description: Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. Exists + is equivalent to wildcard for value, so that a pod can tolerate + all taints of a particular category. + type: string + tolerationSeconds: + description: TolerationSeconds represents the period of time the + toleration (which must be of effect NoExecute, otherwise this + field is ignored) tolerates the taint. By default, it is not + set, which means tolerate the taint forever (do not evict). + Zero and negative values will be treated as 0 (evict immediately) + by the system. + format: int64 + type: integer + value: + description: Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise + just a regular string. + type: string + type: array + version: + description: Version of Prometheus to be deployed. + type: string + status: + description: 'Most recent observed status of the Prometheus cluster. Read-only. + Not included when requesting from the apiserver, only from the Prometheus + Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + properties: + availableReplicas: + description: Total number of available pods (ready for at least minReadySeconds) + targeted by this Prometheus deployment. + format: int32 + type: integer + paused: + description: Represents whether any actions on the underlaying managed + objects are being performed. Only delete actions will be performed. + type: boolean + replicas: + description: Total number of non-terminated pods targeted by this Prometheus + deployment (their labels match the selector). 
+ format: int32 + type: integer + unavailableReplicas: + description: Total number of unavailable pods targeted by this Prometheus + deployment. + format: int32 + type: integer + updatedReplicas: + description: Total number of non-terminated pods targeted by this Prometheus + deployment that have the desired version spec. + format: int32 + type: integer + required: + - paused + - replicas + - updatedReplicas + - availableReplicas + - unavailableReplicas + required: + - spec + version: v1 +status: + acceptedNames: + kind: "" + plural: "" + conditions: null diff --git a/manifests/0prometheus-operator-0servicemonitor-custom-resource-definition.yaml b/manifests/0prometheus-operator-0servicemonitor-custom-resource-definition.yaml new file mode 100644 index 00000000..6573b146 --- /dev/null +++ b/manifests/0prometheus-operator-0servicemonitor-custom-resource-definition.yaml @@ -0,0 +1,236 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: servicemonitors.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: ServiceMonitor + plural: servicemonitors + scope: Namespaced + validation: + openAPIV3Schema: + description: ServiceMonitor defines monitoring for a set of services. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + spec: + description: ServiceMonitorSpec contains specification parameters for a + ServiceMonitor. + properties: + endpoints: + description: A list of endpoints allowed as part of this ServiceMonitor. + items: + description: Endpoint defines a scrapeable endpoint serving Prometheus + metrics. + properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over + basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bearerTokenFile: + description: File to read bearer token for scraping targets. + type: string + honorLabels: + description: HonorLabels chooses the metric's labels on collisions + with target labels. + type: boolean + interval: + description: Interval at which metrics should be scraped + type: string + metricRelabelings: + description: MetricRelabelConfigs to apply to samples before ingestion. 
+ items: + description: 'RelabelConfig allows dynamic rewriting of the + label set, being applied to samples before ingestion. It defines + ``-section of Prometheus configuration. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. + Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label + values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched. defailt is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace + is performed if the regular expression matches. Regex + capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source + label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing + labels. Their content is concatenated using the configured + separator and matched against the configured regular expression + for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written + in a replace action. It is mandatory for replace actions. + Regex capture groups are available. + type: string + type: array + params: + description: Optional HTTP URL parameters + type: object + path: + description: HTTP path to scrape for metrics. + type: string + port: + description: Name of the service port this endpoint refers to. + Mutually exclusive with targetPort. + type: string + scheme: + description: HTTP scheme to use for scraping. + type: string + scrapeTimeout: + description: Timeout after which the scrape is ended + type: string + targetPort: {} + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. 
+ properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + type: array + jobLabel: + description: The label to use to retrieve the job name from. + type: string + namespaceSelector: + description: A selector for selecting namespaces either selecting all + namespaces or a list of namespaces. + properties: + any: + description: Boolean describing whether all namespaces are selected + in contrast to a list restricting them. + type: boolean + matchNames: + description: List of namespace names. + items: + type: string + type: array + selector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. 
+ items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + targetLabels: + description: TargetLabels transfers labels on the Kubernetes Service + onto the target. + items: + type: string + type: array + required: + - endpoints + - selector + required: + - spec + version: v1 +status: + acceptedNames: + kind: "" + plural: "" + conditions: null diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml b/manifests/0prometheus-operator-cluster-role-binding.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml rename to manifests/0prometheus-operator-cluster-role-binding.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/0prometheus-operator-cluster-role.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator-cluster-role.yaml rename to manifests/0prometheus-operator-cluster-role.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml similarity index 94% rename from manifests/prometheus-operator/prometheus-operator-deployment.yaml rename to manifests/0prometheus-operator-deployment.yaml index ac744b2a..b965ec1c 100644 --- a/manifests/prometheus-operator/prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -31,6 +31,8 @@ spec: requests: cpu: 100m memory: 50Mi + nodeSelector: + beta.kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manifests/prometheus-operator/prometheus-operator-service-account.yaml 
b/manifests/0prometheus-operator-service-account.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator-service-account.yaml rename to manifests/0prometheus-operator-service-account.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-service.yaml b/manifests/0prometheus-operator-service.yaml similarity index 76% rename from manifests/prometheus-operator/prometheus-operator-service.yaml rename to manifests/0prometheus-operator-service.yaml index 8a825387..5231b337 100644 --- a/manifests/prometheus-operator/prometheus-operator-service.yaml +++ b/manifests/0prometheus-operator-service.yaml @@ -1,9 +1,12 @@ apiVersion: v1 kind: Service metadata: + labels: + k8s-app: prometheus-operator name: prometheus-operator namespace: monitoring spec: + clusterIP: None ports: - name: http port: 8080 diff --git a/manifests/alertmanager-main/alertmanager-main.yaml b/manifests/alertmanager-alertmanager.yaml similarity index 69% rename from manifests/alertmanager-main/alertmanager-main.yaml rename to manifests/alertmanager-alertmanager.yaml index 84e72ec5..2a8daa8d 100644 --- a/manifests/alertmanager-main/alertmanager-main.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -6,6 +6,9 @@ metadata: name: main namespace: monitoring spec: + baseImage: quay.io/prometheus/alertmanager + nodeSelector: + beta.kubernetes.io/os: linux replicas: 3 serviceAccountName: alertmanager-main version: v0.14.0 diff --git a/manifests/alertmanager-main/alertmanager-main-secret.yaml b/manifests/alertmanager-main/alertmanager-main-secret.yaml deleted file mode 100644 index 4a143fbb..00000000 --- a/manifests/alertmanager-main/alertmanager-main-secret.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v1 -data: - alertmanager.yaml: 
Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== -kind: Secret -metadata: - name: alertmanager-main - namespace: monitoring -type: Opaque diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml new file mode 100644 index 00000000..07155d97 --- /dev/null +++ b/manifests/alertmanager-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +data: + alertmanager.yaml: Cmdsb2JhbDoKICByZXNvbHZlX3RpbWVvdXQ6IDVtCnJvdXRlOgogIGdyb3VwX2J5OiBbJ2pvYiddCiAgZ3JvdXBfd2FpdDogMzBzCiAgZ3JvdXBfaW50ZXJ2YWw6IDVtCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByZWNlaXZlcjogJ251bGwnCiAgcm91dGVzOgogIC0gbWF0Y2g6CiAgICAgIGFsZXJ0bmFtZTogRGVhZE1hbnNTd2l0Y2gKICAgIHJlY2VpdmVyOiAnbnVsbCcKcmVjZWl2ZXJzOgotIG5hbWU6ICdudWxsJwo= +kind: Secret +metadata: + name: alertmanager-main + namespace: monitoring +type: Opaque diff --git a/manifests/alertmanager-main/alertmanager-main-service-account.yaml b/manifests/alertmanager-service-account.yaml similarity index 100% rename from manifests/alertmanager-main/alertmanager-main-service-account.yaml rename to manifests/alertmanager-service-account.yaml diff --git a/manifests/alertmanager-main/alertmanager-main-service-monitor.yaml b/manifests/alertmanager-service-monitor.yaml similarity index 100% rename from manifests/alertmanager-main/alertmanager-main-service-monitor.yaml rename to manifests/alertmanager-service-monitor.yaml diff --git a/manifests/alertmanager-main/alertmanager-main-service.yaml b/manifests/alertmanager-service.yaml similarity index 100% rename from manifests/alertmanager-main/alertmanager-main-service.yaml rename to manifests/alertmanager-service.yaml diff --git a/manifests/grafana/grafana-datasources.yaml b/manifests/grafana-dashboard-datasources.yaml similarity index 
100% rename from manifests/grafana/grafana-datasources.yaml rename to manifests/grafana-dashboard-datasources.yaml diff --git a/manifests/grafana/grafana-dashboard-definitions.yaml b/manifests/grafana-dashboard-definitions.yaml similarity index 54% rename from manifests/grafana/grafana-dashboard-definitions.yaml rename to manifests/grafana-dashboard-definitions.yaml index 573281af..8fd4a0d6 100644 --- a/manifests/grafana/grafana-dashboard-definitions.yaml +++ b/manifests/grafana-dashboard-definitions.yaml @@ -1,610 +1,23 @@ apiVersion: v1 data: - deployments-dashboard.json: |- + k8s-cluster-rsrc-use.json: |- { "annotations": { "list": [ ] }, - "editable": false, + "editable": true, "gnetId": null, "graphTooltip": 0, "hideControls": false, - "id": null, "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=\u007e\"$deployment_name.*\"}[3m]))", - 
"format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "CPU", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=\u007e\"$deployment_name.*\"}) / 1024^3", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, 
- "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=\u007e\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=\u007e\"$deployment_name.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - 
"nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_deployment_spec_replicas{namespace=\"$deployment_namespace\",deployment=\"$deployment_name\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(kube_deployment_status_replicas_available{namespace=\"$deployment_namespace\",deployment=\"$deployment_name\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" 
- } - ], - "thresholds": "", - "title": "Available Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_deployment_status_observed_generation{namespace=\"$deployment_namespace\",deployment=\"$deployment_name\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 8, 
- "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, "height": "250px", "panels": [ { @@ -614,31 +27,1673 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 9, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_utilisation:avg1m * 
node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_saturation_load1: / scalar(sum(min(kube_pod_info) by (node)))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Saturation (Load1)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, 
+ "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_utilisation:ratio", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], 
+ "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_swap_io_bytes:sum_rate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Saturation (Swap I/O)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_utilisation:avg_irate / scalar(:kube_pod_info_node_count:)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": 
"/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_saturation:avg_irate / scalar(:kube_pod_info_node_count:)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + 
"repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_utilisation:sum_irate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Utilisation (Transmitted)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + 
"spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_saturation:sum_irate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Saturation (Dropped)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Network", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"} - node_filesystem_free{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace)) by (pod,namespace) / scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace))) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:\n", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Capacity", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "K8s / USE Method / Cluster", + "version": 0 + } + k8s-node-rsrc-use.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 0, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": 
false, "min": false, - "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_utilisation:avg1m{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_saturation_load1:{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU 
Saturation (Load1)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_utilisation:{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Memory", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, 
+ "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_swap_io_bytes:sum_rate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Swap IO", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Saturation (Swap I/O)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + 
"span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_utilisation:avg_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_saturation:avg_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + 
"show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_utilisation:sum_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Utilisation (Transmitted)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + 
"nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_saturation:sum_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Saturation (Dropped)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Net", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], @@ -648,39 +1703,12 @@ data: "steppedLine": false, "targets": [ { - "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "expr": "1 - sum(max by (device, node) (node_filesystem_free{fstype=\u007e\"ext[24]\"})) / sum(max by (device, node) 
(node_filesystem_size{fstype=\u007e\"ext[24]\"}))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "current replicas", - "refId": "A" - }, - { - "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "available", - "refId": "B" - }, - { - "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "unavailable", - "refId": "C" - }, - { - "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "D" - }, - { - "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "desired", - "refId": "E" + "legendFormat": "Disk", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -688,7 +1716,555 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Replicas", + "title": "Disk Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": 
"dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "node", + "multi": false, + "name": "node", + "options": [ + + ], + "query": "label_values(kube_node_info, node)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "K8s / USE Method / Node", + "version": 0 + } + k8s-resources-cluster.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "100px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(kube_pod_container_resource_requests_cpu_cores) / sum(node:node_num_cpu:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Requests Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores) / sum(node:node_num_cpu:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Limits Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(node_memory_MemTotal)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Requests Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + 
"steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(node_memory_MemTotal)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Limits Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Headlines", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_cpu_usage_seconds_total[1m])) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", "tooltip": { "shared": true, "sort": 0, @@ -710,7 +2286,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, 
"show": true }, { @@ -719,7 +2295,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ] } @@ -727,10 +2303,631 @@ data: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU 
Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": 
"sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_rss) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage (w/o cache)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", 
+ "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(container_memory_rss) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + 
"legendFormat": "", + "step": 10 + }, + { + "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Requests by Namespace", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Requests", + "titleSize": "h6" } ], "schemaVersion": 14, @@ -741,23 +2938,1657 @@ data: "templating": { "list": [ { - "allValue": null, "current": { - + "text": "Prometheus", + "value": "Prometheus" }, - "datasource": "prometheus", "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "deployment_namespace", + "label": null, + "name": "datasource", "options": [ ], - "query": "label_values(kube_deployment_metadata_generation, namespace)", - "refresh": 2, + "query": "prometheus", + "refresh": 1, "regex": "", - "sort": 0, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "K8s / Compute Resources / Cluster", + "version": 0 + } + k8s-resources-namespace.json: |- + { + "annotations": { + "list": [ + + ] + }, 
+ "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[1m])) by (pod_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + 
"values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": 
true, + "linkTooltip": "Drill down", + "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\"}) by (pod_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": 
null, + "showTitle": true, + "title": "Memory Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + 
"alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", 
\"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Quota", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + 
"timezone": "utc", + "title": "K8s / Compute Resources / Namespace", + "version": 0 + } + k8s-resources-pod.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod_name=\"$pod\"}[1m])) by (container_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, 
+ "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + 
"thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / 
sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}) by (container_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": 
{ + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", 
+ "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", 
+ "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Quota", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + 
"query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, "tagValuesQuery": "", "tags": [ @@ -769,21 +4600,22 @@ data: { "allValue": null, "current": { - + "text": "prod", + "value": "prod" }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "Name", + "label": "pod", "multi": false, - "name": "deployment_name", + "name": "pod", "options": [ ], - "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", - "refresh": 2, + "query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)", + "refresh": 1, "regex": "", - "sort": 0, + "sort": 2, "tagValuesQuery": "", "tags": [ @@ -823,4381 +4655,8 @@ data: "30d" ] }, - "timezone": "browser", - "title": "Deployments", - "version": 0 - } - kubernetes-capacity-planning-dashboard.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cpu}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - 
"timeFrom": null, - "timeShift": null, - "title": "Idle CPU", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_load1)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 1m", - "refId": "A" - }, - { - "expr": "sum(node_load5)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 5m", - "refId": "B" - }, - { - "expr": "sum(node_load15)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 15m", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "System Load", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "percent", - "label": null, 
- "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory used", - "refId": "A" - }, - { - "expr": "sum(node_memory_Buffers)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "refId": "B" - }, - { - "expr": "sum(node_memory_Cached)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory cached", - "refId": "C" - }, - { - "expr": "sum(node_memory_MemFree)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory free", - "refId": "D" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": 
null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Memory Usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - 
- }, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_disk_bytes_read[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "read", - "refId": "A" - }, - { - "expr": "sum(rate(node_disk_bytes_written[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "written", - "refId": "B" - }, - { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "io time", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - 
{ - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"}) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Disk Space Usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes{device!\u007e\"lo\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - 
"thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network Received", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(node_network_transmit_bytes{device!\u007e\"lo\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network Transmitted", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard 
Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 10, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_info)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current Number of Pods", - "refId": "A" - }, - { - "expr": "sum(kube_node_status_capacity_pods)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Maximum Capacity of Pods", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Cluster Pod Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - 
"gridPos": { - - }, - "id": 11, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Pod Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - - ] - }, - "time": { - "from": "now-24h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Capacity Planning", - "version": 0 - } - kubernetes-cluster-health-dashboard.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "collapsed": false, - 
"height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{job=\u007e\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Control Plane Components Down", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "Everything UP and healthy", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - 
"nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Alerts Firing", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Alerts Pending", - "type": "singlestat", - "valueFontSize": "80%", 
- "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Crashlooping Pods", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - 
"thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Node Not Ready", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", 
- "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Node Disk Pressure", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 8, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Node Memory Pressure", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - 
"minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 9, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kube_node_spec_unschedulable)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Node Unschedulable", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Cluster Health", - "version": 0 - } - kubernetes-cluster-status-dashboard.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "collapsed": 
false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 6, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{job=\u007e\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Control Plane UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - 
"nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 6, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Alerts Firing", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": 
"(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "50, 80", - "title": "API Servers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "50, 80", - "title": "Controller Managers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": 
"prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "50, 80", - "title": "Schedulers Up", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - 
"sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "count(increase(kube_pod_container_status_restarts{namespace=\u007e\"kube-system|tectonic-system\"}[1h]) > 5)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Crashlooping Control Plane Pods", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 8, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / 
count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "CPU Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 9, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Memory Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - 
"datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 10, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Filesystem Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 11, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - 
"prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Pod Utilization", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Cluster Status", - "version": 0 - } - kubernetes-control-plane-status-dashboard.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - 
"thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "50, 80", - "title": "API Servers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - 
"lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "50, 80", - "title": "Controller Mangers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "50, 80", - "title": "Schedulers UP", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - 
"rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(sum by(instance) (rate(apiserver_request_count{code=\u007e\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "50, 80", - "title": "API Request Error Rate", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - 
"nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "API Request Latency", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 7, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(instance) (rate(apiserver_request_count{code!\u007e\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - 
"legendFormat": "Error Rate", - "refId": "A" - }, - { - "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Request Rate", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "API Request Rate", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "End to End Scheduling Latency", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": 
"individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Control Plane Status", - "version": 0 - } - kubernetes-kubelet-dashboard.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - 
"postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_pod_count{instance=\u007e\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Count", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "kubelet_running_pod_count{instance=\u007e\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ instance }}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - 
] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Pods", - "titleSize": "h4", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "prometheus", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_container_count{instance=\u007e\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Count", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 5, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - 
"pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "kubelet_running_container_count{instance=\u007e\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ instance }}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Containers", - "titleSize": "h4", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "description": "Rate of Kubelet Operations in 5min", - "fill": 1, - "gridPos": { - - }, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations{instance=\u007e\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - 
"legendFormat": "{{ instance }}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operations", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Kubelet", - "titleSize": "h4", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "allValue": null, - "current": { - - }, - "datasource": "prometheus", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "instance", - "options": [ - - ], - "query": "label_values(kubelet_running_pod_count,instance)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubelet", - "version": 0 - } - kubernetes-resource-requests-dashboard.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - 
"dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Allocatable CPU Cores", - "refId": "A" - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Requested CPU Cores", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Cores", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - 
"value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "CPU Cores", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Allocatable Memory", - 
"refId": "A" - }, - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Requested Memory", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - 
"thresholds": "80, 90", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubernetes Resource Requests", + "timezone": "utc", + "title": "K8s / Compute Resources / Pod", "version": 0 } nodes.json: |- @@ -5229,7 +4688,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -5263,7 +4722,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "expr": "100 - (avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[5m])) * 100)\n", "format": "time_series", "intervalFactor": 10, "legendFormat": "{{cpu}}", @@ -5317,7 +4776,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -5351,21 +4810,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "node_load1{instance=\"$server\"} * 100", + "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"} * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 1m", "refId": "A" }, { - "expr": "node_load5{instance=\"$server\"} * 100", + "expr": "node_load5{job=\"node-exporter\", 
instance=\"$instance\"} * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 5m", "refId": "B" }, { - "expr": "node_load15{instance=\"$server\"} * 100", + "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"} * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 15m", @@ -5433,7 +4892,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -5467,28 +4926,28 @@ data: "steppedLine": false, "targets": [ { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "expr": "node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory used", "refId": "A" }, { - "expr": "node_memory_Buffers{instance=\"$server\"}", + "expr": "node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory buffers", "refId": "B" }, { - "expr": "node_memory_Cached{instance=\"$server\"}", + "expr": "node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory cached", "refId": "C" }, { - "expr": "node_memory_MemFree{instance=\"$server\"}", + "expr": "node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory free", @@ -5596,7 +5055,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - 
node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "expr": "(\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n) * 100\n /\nnode_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -5636,7 +5095,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -5677,21 +5136,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_bytes_read{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "read", "refId": "A" }, { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_bytes_written{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "written", "refId": "B" }, { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_io_time_ms{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "io time", @@ -5799,7 +5258,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) * 100", + "expr": "(\n 
sum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n- sum(node_filesystem_free{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n) * 100\n /\nsum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -5839,7 +5298,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -5873,7 +5332,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!\u007e\"lo\"}[5m])", + "expr": "rate(node_network_receive_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5927,7 +5386,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -5961,7 +5420,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!\u007e\"lo\"}[5m])", + "expr": "rate(node_network_transmit_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6025,21 +5484,37 @@ data: ], "templating": { "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, { "allValue": null, "current": { }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 0, "includeAll": false, "label": null, "multi": false, - "name": "server", + "name": "instance", "options": [ ], - "query": "label_values(node_boot_time, instance)", + "query": 
"label_values(node_boot_time{job=\"node-exporter\"}, instance)", "refresh": 2, "regex": "", "sort": 0, @@ -6086,7 +5561,7 @@ data: "title": "Nodes", "version": 0 } - pods-dashboard.json: |- + pods.json: |- { "annotations": { "list": [ @@ -6115,7 +5590,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -6149,21 +5624,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -6231,7 +5706,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": 
"$datasource", "fill": 1, "gridPos": { @@ -6265,7 +5740,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ container_name }}", @@ -6333,7 +5808,7 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -6367,7 +5842,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", @@ -6431,12 +5906,28 @@ data: ], "templating": { "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, { "allValue": null, "current": { }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 0, "includeAll": false, "label": "Namespace", @@ -6462,7 +5953,7 @@ data: "current": { }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 0, "includeAll": false, "label": "Pod", @@ -6488,7 +5979,7 @@ data: "current": { }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 0, "includeAll": true, "label": "Container", @@ -6544,7 +6035,7 @@ data: "title": "Pods", "version": 0 } - statefulset-dashboard.json: |- + statefulset.json: |- { "annotations": { "list": [ @@ -6606,7 +6097,7 @@ data: "maxDataPoints": 100, "nullPointMode": 
"connected", "nullText": null, - "postfix": "", + "postfix": "cores", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -6626,7 +6117,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -6685,7 +6176,7 @@ data: "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, - "postfix": "", + "postfix": "GB", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -6705,7 +6196,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -6764,7 +6255,7 @@ data: "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, - "postfix": "", + "postfix": "Bps", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", @@ -6784,7 +6275,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ 
-6878,7 +6369,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_replicas{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -6958,7 +6449,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "min(kube_statefulset_status_replicas_current{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -7038,7 +6529,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_status_observed_generation{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", + "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -7118,7 +6609,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_metadata_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -7192,35 +6683,35 @@ data: "steppedLine": false, "targets": [ { - "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "expr": 
"max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas specified", "refId": "A" }, { - "expr": "max(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas created", "refId": "B" }, { - "expr": "min(kube_statefulset_status_replicas_ready{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "ready", "refId": "C" }, { - "expr": "min(kube_statefulset_status_replicas_current{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas of current version", "refId": "D" }, { - "expr": "min(kube_statefulset_status_replicas_updated{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "updated", @@ -7284,6 +6775,22 @@ data: ], "templating": { "list": [ + { + "current": { + "text": "Prometheus", + "value": 
"Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, { "allValue": null, "current": { @@ -7294,11 +6801,11 @@ data: "includeAll": false, "label": "Namespace", "multi": false, - "name": "statefulset_namespace", + "name": "namespace", "options": [ ], - "query": "label_values(kube_statefulset_metadata_generation, namespace)", + "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}, namespace)", "refresh": 2, "regex": "", "sort": 0, @@ -7320,11 +6827,11 @@ data: "includeAll": false, "label": "Name", "multi": false, - "name": "statefulset_name", + "name": "statefulset", "options": [ ], - "query": "label_values(kube_statefulset_metadata_generation{namespace=\"$statefulset_namespace\"}, statefulset)", + "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", namespace=\"$namespace\"}, statefulset)", "refresh": 2, "regex": "", "sort": 0, diff --git a/manifests/grafana/grafana-dashboard-sources.yaml b/manifests/grafana-dashboard-sources.yaml similarity index 100% rename from manifests/grafana/grafana-dashboard-sources.yaml rename to manifests/grafana-dashboard-sources.yaml diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana-deployment.yaml similarity index 86% rename from manifests/grafana/grafana-deployment.yaml rename to manifests/grafana-deployment.yaml index 9d7ae88f..814e98cb 100644 --- a/manifests/grafana/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: quay.io/coreos/monitoring-grafana:5.0.3 + - image: grafana/grafana:5.1.0 name: grafana ports: - containerPort: 3000 @@ -29,13 +29,13 @@ spec: cpu: 100m memory: 100Mi volumeMounts: - - mountPath: /data + - mountPath: /var/lib/grafana name: grafana-storage readOnly: false - - mountPath: /grafana/conf/provisioning/datasources + - 
mountPath: /etc/grafana/provisioning/datasources name: grafana-datasources readOnly: false - - mountPath: /grafana/conf/provisioning/dashboards + - mountPath: /etc/grafana/provisioning/dashboards name: grafana-dashboards readOnly: false - mountPath: /grafana-dashboard-definitions/0 diff --git a/manifests/grafana/grafana-service-account.yaml b/manifests/grafana-service-account.yaml similarity index 100% rename from manifests/grafana/grafana-service-account.yaml rename to manifests/grafana-service-account.yaml diff --git a/manifests/grafana/grafana-service.yaml b/manifests/grafana-service.yaml similarity index 100% rename from manifests/grafana/grafana-service.yaml rename to manifests/grafana-service.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml b/manifests/kube-state-metrics-cluster-role-binding.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml rename to manifests/kube-state-metrics-cluster-role-binding.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml b/manifests/kube-state-metrics-cluster-role.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml rename to manifests/kube-state-metrics-cluster-role.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml similarity index 97% rename from manifests/kube-state-metrics/kube-state-metrics-deployment.yaml rename to manifests/kube-state-metrics-deployment.yaml index bd6d9475..fb2a8b5f 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -89,6 +89,8 @@ spec: requests: cpu: 10m memory: 30Mi + nodeSelector: + beta.kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml 
b/manifests/kube-state-metrics-role-binding.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml rename to manifests/kube-state-metrics-role-binding.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-role.yaml b/manifests/kube-state-metrics-role.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-role.yaml rename to manifests/kube-state-metrics-role.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml b/manifests/kube-state-metrics-service-account.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-service-account.yaml rename to manifests/kube-state-metrics-service-account.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-service-monitor.yaml b/manifests/kube-state-metrics-service-monitor.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-service-monitor.yaml rename to manifests/kube-state-metrics-service-monitor.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-service.yaml b/manifests/kube-state-metrics-service.yaml similarity index 94% rename from manifests/kube-state-metrics/kube-state-metrics-service.yaml rename to manifests/kube-state-metrics-service.yaml index 3e88b562..84927af3 100644 --- a/manifests/kube-state-metrics/kube-state-metrics-service.yaml +++ b/manifests/kube-state-metrics-service.yaml @@ -6,6 +6,7 @@ metadata: name: kube-state-metrics namespace: monitoring spec: + clusterIP: None ports: - name: https-main port: 8443 diff --git a/manifests/node-exporter/node-exporter-cluster-role-binding.yaml b/manifests/node-exporter-cluster-role-binding.yaml similarity index 100% rename from manifests/node-exporter/node-exporter-cluster-role-binding.yaml rename to manifests/node-exporter-cluster-role-binding.yaml diff --git a/manifests/node-exporter/node-exporter-cluster-role.yaml b/manifests/node-exporter-cluster-role.yaml 
similarity index 100% rename from manifests/node-exporter/node-exporter-cluster-role.yaml rename to manifests/node-exporter-cluster-role.yaml diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml similarity index 90% rename from manifests/node-exporter/node-exporter-daemonset.yaml rename to manifests/node-exporter-daemonset.yaml index 1284e93d..8488735c 100644 --- a/manifests/node-exporter/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -50,10 +50,15 @@ spec: requests: cpu: 10m memory: 20Mi + nodeSelector: + beta.kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 65534 serviceAccountName: node-exporter + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master volumes: - hostPath: path: /proc diff --git a/manifests/node-exporter/node-exporter-service-account.yaml b/manifests/node-exporter-service-account.yaml similarity index 100% rename from manifests/node-exporter/node-exporter-service-account.yaml rename to manifests/node-exporter-service-account.yaml diff --git a/manifests/node-exporter/node-exporter-service-monitor.yaml b/manifests/node-exporter-service-monitor.yaml similarity index 100% rename from manifests/node-exporter/node-exporter-service-monitor.yaml rename to manifests/node-exporter-service-monitor.yaml diff --git a/manifests/node-exporter/node-exporter-service.yaml b/manifests/node-exporter-service.yaml similarity index 92% rename from manifests/node-exporter/node-exporter-service.yaml rename to manifests/node-exporter-service.yaml index 101a9769..1d728d76 100644 --- a/manifests/node-exporter/node-exporter-service.yaml +++ b/manifests/node-exporter-service.yaml @@ -6,6 +6,7 @@ metadata: name: node-exporter namespace: monitoring spec: + clusterIP: None ports: - name: https port: 9100 diff --git a/manifests/prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml b/manifests/prometheus-cluster-role-binding.yaml similarity index 100% 
rename from manifests/prometheus-k8s/prometheus-k8s-cluster-role-binding.yaml rename to manifests/prometheus-cluster-role-binding.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-cluster-role.yaml b/manifests/prometheus-cluster-role.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-cluster-role.yaml rename to manifests/prometheus-cluster-role.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-rules.yaml b/manifests/prometheus-k8s/prometheus-k8s-rules.yaml deleted file mode 100644 index 0c03de56..00000000 --- a/manifests/prometheus-k8s/prometheus-k8s-rules.yaml +++ /dev/null @@ -1,590 +0,0 @@ -apiVersion: v1 -data: - alertmanager.rules.yaml: | - groups: - - name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) - GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", - "alertmanager-$1", "alertmanager", "(.*)") != 1 - for: 5m - labels: - severity: critical - annotations: - description: The configuration of the instances of the Alertmanager cluster - `{{$labels.service}}` are out of sync. - summary: Configuration out of sync - - alert: AlertmanagerDownOrMissing - expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", - "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 - for: 5m - labels: - severity: warning - annotations: - description: An unexpected number of Alertmanagers are scraped or Alertmanagers - disappeared from discovery. - summary: Alertmanager down or missing - - alert: AlertmanagerFailedReload - expr: alertmanager_config_last_reload_successful == 0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. 
- summary: Alertmanager's configuration reload failed - etcd3.rules.yaml: | - groups: - - name: ./etcd3.rules - rules: - - alert: InsufficientMembers - expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) - for: 3m - labels: - severity: critical - annotations: - description: If one more etcd member goes down the cluster will be unavailable - summary: etcd cluster insufficient members - - alert: NoLeader - expr: etcd_server_has_leader{job="etcd"} == 0 - for: 1m - labels: - severity: critical - annotations: - description: etcd member {{ $labels.instance }} has no leader - summary: etcd member has no leader - - alert: HighNumberOfLeaderChanges - expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader - changes within the last hour - summary: a high number of leader changes within the etcd cluster are happening - - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: GRPCRequestsSlow - expr: histogram_quantile(0.99, 
sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) - > 0.15 - for: 10m - labels: - severity: critical - annotations: - description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method - }} are slow - summary: slow gRPC requests - - alert: HighNumberOfFailedHTTPRequests - expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) - BY (method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}' - summary: a high number of HTTP requests are failing - - alert: HighNumberOfFailedHTTPRequests - expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) - BY (method) > 0.05 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}' - summary: a high number of HTTP requests are failing - - alert: HTTPRequestsSlow - expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning - annotations: - description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method - }} are slow - summary: slow HTTP requests - - alert: EtcdMemberCommunicationSlow - expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} member communication with - {{ $labels.To }} is slow - summary: etcd member communication is slow - - alert: HighNumberOfFailedProposals - expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 - labels: - severity: warning - annotations: - description: etcd instance 
{{ $labels.instance }} has seen {{ $value }} proposal - failures within the last hour - summary: a high number of proposals within the etcd cluster are failing - - alert: HighFsyncDurations - expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) - > 0.5 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} fync durations are high - summary: high fsync durations - - alert: HighCommitDurations - expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) - > 0.25 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} commit durations are high - summary: high commit durations - general.rules.yaml: | - groups: - - name: general.rules - rules: - - alert: TargetDown - expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of {{ $labels.job }} targets are down.' - summary: Targets are down - - alert: DeadMansSwitch - expr: vector(1) - labels: - severity: none - annotations: - description: This is a DeadMansSwitch meant to ensure that the entire Alerting - pipeline is functional. 
- summary: Alerting DeadMansSwitch - - record: fd_utilization - expr: process_open_fds / process_max_fds - - alert: FdExhaustionClose - expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next 4 hours' - summary: file descriptors soon exhausted - - alert: FdExhaustionClose - expr: predict_linear(fd_utilization[10m], 3600) > 1 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next hour' - summary: file descriptors soon exhausted - kube-controller-manager.rules.yaml: | - groups: - - name: kube-controller-manager.rules - rules: - - alert: K8SControllerManagerDown - expr: absent(up{job="kube-controller-manager"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: There is no running K8S controller manager. Deployments and replication - controllers are not making progress. 
- runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager - summary: Controller manager is down - kube-scheduler.rules.yaml: | - groups: - - name: kube-scheduler.rules - rules: - - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile - expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.99" - - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile - expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile - expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile - expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.99" - - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile - expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile - expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - record: cluster:scheduler_binding_latency_seconds:quantile - expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.99" - - record: cluster:scheduler_binding_latency_seconds:quantile - expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: 
cluster:scheduler_binding_latency_seconds:quantile - expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - alert: K8SSchedulerDown - expr: absent(up{job="kube-scheduler"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: There is no running K8S scheduler. New pods are not being assigned - to nodes. - runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler - summary: Scheduler is down - kube-state-metrics.rules.yaml: | - groups: - - name: kube-state-metrics.rules - rules: - - alert: DeploymentGenerationMismatch - expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation - for: 15m - labels: - severity: warning - annotations: - description: Observed deployment generation does not match expected one for - deployment {{$labels.namespace}}/{{$labels.deployment}} - summary: Deployment is outdated - - alert: DeploymentReplicasNotUpdated - expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) - or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) - unless (kube_deployment_spec_paused == 1) - for: 15m - labels: - severity: warning - annotations: - description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}} - summary: Deployment replicas are outdated - - alert: DaemonSetRolloutStuck - expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled - * 100 < 100 - for: 15m - labels: - severity: warning - annotations: - description: Only {{$value}}% of desired pods scheduled and ready for daemon - set {{$labels.namespace}}/{{$labels.daemonset}} - summary: DaemonSet is missing pods - - alert: K8SDaemonSetsNotScheduled - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled - > 0 - for: 10m - labels: - 
severity: warning - annotations: - description: A number of daemonsets are not scheduled. - summary: Daemonsets are not scheduled correctly - - alert: DaemonSetsMissScheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are running where they are not supposed - to run. - summary: Daemonsets are not scheduled correctly - - alert: PodFrequentlyRestarting - expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 - for: 10m - labels: - severity: warning - annotations: - description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} - times within the last hour - summary: Pod is restarting frequently - kubelet.rules.yaml: | - groups: - - name: kubelet.rules - rules: - - alert: K8SNodeNotReady - expr: kube_node_status_condition{condition="Ready",status="true"} == 0 - for: 1h - labels: - severity: warning - annotations: - description: The Kubelet on {{ $labels.node }} has not checked in with the API, - or has set itself to NotReady, for more than an hour - summary: Node status is NotReady - - alert: K8SManyNodesNotReady - expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) - > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == - 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 - for: 1m - labels: - severity: critical - annotations: - description: '{{ $value }}% of Kubernetes nodes are not ready' - - alert: K8SKubeletDown - expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 - for: 1h - labels: - severity: warning - annotations: - description: Prometheus failed to scrape {{ $value }}% of kubelets. 
- summary: Prometheus failed to scrape - - alert: K8SKubeletDown - expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) - * 100 > 10 - for: 1h - labels: - severity: critical - annotations: - description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets - have disappeared from service discovery. - summary: Many Kubelets cannot be scraped - - alert: K8SKubeletTooManyPods - expr: kubelet_running_pod_count > 100 - for: 10m - labels: - severity: warning - annotations: - description: Kubelet {{$labels.instance}} is running {{$value}} pods, close - to the limit of 110 - summary: Kubelet is close to pod limit - kubernetes.rules.yaml: | - groups: - - name: kubernetes.rules - rules: - - record: pod_name:container_memory_usage_bytes:sum - expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY - (pod_name) - - record: pod_name:container_spec_cpu_shares:sum - expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) - - record: pod_name:container_cpu_usage:sum - expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) - BY (pod_name) - - record: pod_name:container_fs_usage_bytes:sum - expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) - - record: namespace:container_memory_usage_bytes:sum - expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) - - record: namespace:container_spec_cpu_shares:sum - expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) - - record: namespace:container_cpu_usage:sum - expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) - BY (namespace) - - record: cluster:memory_usage:ratio - expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY - (cluster) / sum(machine_memory_bytes) BY (cluster) - - record: cluster:container_spec_cpu_shares:ratio - expr: 
sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 - / sum(machine_cpu_cores) - - record: cluster:container_cpu_usage:ratio - expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) - / sum(machine_cpu_cores) - - record: apiserver_latency_seconds:quantile - expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / - 1e+06 - labels: - quantile: "0.99" - - record: apiserver_latency:quantile_seconds - expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / - 1e+06 - labels: - quantile: "0.9" - - record: apiserver_latency_seconds:quantile - expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / - 1e+06 - labels: - quantile: "0.5" - - alert: APIServerLatencyHigh - expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} - > 1 - for: 10m - labels: - severity: warning - annotations: - description: the API server has a 99th percentile latency of {{ $value }} seconds - for {{$labels.verb}} {{$labels.resource}} - summary: API server high latency - - alert: APIServerLatencyHigh - expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} - > 4 - for: 10m - labels: - severity: critical - annotations: - description: the API server has a 99th percentile latency of {{ $value }} seconds - for {{$labels.verb}} {{$labels.resource}} - summary: API server high latency - - alert: APIServerErrorsHigh - expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) - * 100 > 2 - for: 10m - labels: - severity: warning - annotations: - description: API server returns errors for {{ $value }}% of requests - summary: API server request errors - - alert: APIServerErrorsHigh - expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) - * 100 > 5 - for: 10m - labels: - severity: 
critical - annotations: - description: API server returns errors for {{ $value }}% of requests - - alert: K8SApiserverDown - expr: absent(up{job="apiserver"} == 1) - for: 20m - labels: - severity: critical - annotations: - description: No API servers are reachable or all have disappeared from service - discovery - summary: No API servers are reachable - - - alert: K8sCertificateExpirationNotice - labels: - severity: warning - annotations: - description: Kubernetes API Certificate is expiring soon (less than 7 days) - summary: Kubernetes API Certificate is expiering soon - expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 - - - alert: K8sCertificateExpirationNotice - labels: - severity: critical - annotations: - description: Kubernetes API Certificate is expiring in less than 1 day - summary: Kubernetes API Certificate is expiering - expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 - node.rules.yaml: | - groups: - - name: node.rules - rules: - - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) - BY (instance) - - record: instance:node_filesystem_usage:sum - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) - BY (instance) - - record: instance:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) - - record: instance:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) - - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) - GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) - - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - - record: cluster:node_cpu:ratio - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - - alert: NodeExporterDown - expr: 
absent(up{job="node-exporter"} == 1) - for: 10m - labels: - severity: warning - annotations: - description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery - summary: Prometheus could not scrape a node-exporter - - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 - for: 30m - labels: - severity: warning - annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 24 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 24 hours - - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 - for: 10m - labels: - severity: critical - annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 2 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 2 hours - prometheus.rules.yaml: "groups:\n- name: prometheus.rules\n rules:\n - alert: - PrometheusConfigReloadFailed\n expr: prometheus_config_last_reload_successful - == 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description: - Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}\n - \ summary: Reloading Promehteus' configuration failed\n\n - alert: PrometheusNotificationQueueRunningFull\n - \ expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > - prometheus_notifications_queue_capacity\n for: 10m\n labels:\n severity: - warning\n annotations:\n description: Prometheus' alert notification queue - is running full for {{$labels.namespace}}/{{\n $labels.pod}}\n summary: - Prometheus' alert notification queue is running full \n\n - alert: PrometheusErrorSendingAlerts\n - \ expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])\n - \ > 0.01\n for: 10m\n labels:\n severity: warning\n 
annotations:\n - \ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{\n - \ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\n summary: - Errors while sending alert from Prometheus\n\n - alert: PrometheusErrorSendingAlerts\n - \ expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])\n - \ > 0.03\n for: 10m\n labels:\n severity: critical\n annotations:\n - \ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{\n - \ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\n summary: - Errors while sending alerts from Prometheus\n\n - alert: PrometheusNotConnectedToAlertmanagers\n - \ expr: prometheus_notifications_alertmanagers_discovered < 1\n for: 10m\n - \ labels:\n severity: warning\n annotations:\n description: Prometheus - {{ $labels.namespace }}/{{ $labels.pod}} is not connected\n to any Alertmanagers\n - \ summary: Prometheus is not connected to any Alertmanagers\n\n - alert: - PrometheusTSDBReloadsFailing\n expr: increase(prometheus_tsdb_reloads_failures_total[2h]) - > 0\n for: 12h\n labels:\n severity: warning\n annotations:\n description: - '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}\n reload - failures over the last four hours.'\n summary: Prometheus has issues reloading - data blocks from disk\n\n - alert: PrometheusTSDBCompactionsFailing\n expr: - increase(prometheus_tsdb_compactions_failed_total[2h]) > 0\n for: 12h\n labels:\n - \ severity: warning\n annotations:\n description: '{{$labels.job}} - at {{$labels.instance}} had {{$value | humanize}}\n compaction failures - over the last four hours.'\n summary: Prometheus has issues compacting sample - blocks\n\n - alert: PrometheusTSDBWALCorruptions\n expr: tsdb_wal_corruptions_total - > 0\n for: 4h\n labels:\n severity: warning\n annotations:\n description: - '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead\n log - (WAL).'\n summary: Prometheus 
write-ahead log is corrupted\n\n - alert: - PrometheusNotIngestingSamples\n expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) - <= 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description: - \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.\"\n - \ summary: \"Prometheus isn't ingesting samples\"\n\n - alert: PrometheusTargetScapesDuplicate\n - \ expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) - > 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description: - \"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate - timestamps but different values\"\n summary: Prometheus has many samples - rejected\n" -kind: ConfigMap -metadata: - labels: - prometheus: k8s - role: alert-rules - name: prometheus-k8s-rules - namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s.yaml b/manifests/prometheus-prometheus.yaml similarity index 85% rename from manifests/prometheus-k8s/prometheus-k8s.yaml rename to manifests/prometheus-prometheus.yaml index 324d96c7..b7fe9f25 100644 --- a/manifests/prometheus-k8s/prometheus-k8s.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -11,6 +11,9 @@ spec: - name: alertmanager-main namespace: monitoring port: web + baseImage: quay.io/prometheus/prometheus + nodeSelector: + beta.kubernetes.io/os: linux replicas: 2 resources: requests: diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml b/manifests/prometheus-role-binding-config.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml rename to manifests/prometheus-role-binding-config.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-default.yaml b/manifests/prometheus-role-binding-default.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-role-binding-default.yaml rename to manifests/prometheus-role-binding-default.yaml diff --git 
a/manifests/prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml b/manifests/prometheus-role-binding-kube-system.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-role-binding-kube-system.yaml rename to manifests/prometheus-role-binding-kube-system.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml b/manifests/prometheus-role-binding-namespace.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-role-binding-namespace.yaml rename to manifests/prometheus-role-binding-namespace.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-config.yaml b/manifests/prometheus-role-config.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-role-config.yaml rename to manifests/prometheus-role-config.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-default.yaml b/manifests/prometheus-role-default.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-role-default.yaml rename to manifests/prometheus-role-default.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-kube-system.yaml b/manifests/prometheus-role-kube-system.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-role-kube-system.yaml rename to manifests/prometheus-role-kube-system.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-namespace.yaml b/manifests/prometheus-role-namespace.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-role-namespace.yaml rename to manifests/prometheus-role-namespace.yaml diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml new file mode 100644 index 00000000..852a3362 --- /dev/null +++ b/manifests/prometheus-rules.yaml @@ -0,0 +1,166 @@ +apiVersion: v1 +data: + all.rules.yaml: "\"groups\": \n- \"name\": \"k8s.rules\"\n \"rules\": \n - \"expr\": + |\n sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", 
image!=\"\"}[5m])) + by (namespace)\n \"record\": \"namespace:container_cpu_usage_seconds_total:sum_rate\"\n + \ - \"expr\": |\n sum(container_memory_usage_bytes{job=\"kubelet\", image!=\"\"}) + by (namespace)\n \"record\": \"namespace:container_memory_usage_bytes:sum\"\n + \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", + image!=\"\"}[5m])) by (namespace, pod_name)\n * on (namespace, pod_name) + group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, + \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:container_cpu_usage_seconds_total:sum_rate\"\n + \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(container_memory_usage_bytes{job=\"kubelet\",image!=\"\"}) + by (pod_name, namespace)\n * on (namespace, pod_name) group_left(label_name)\n + \ label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, \"pod_name\", + \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:container_memory_usage_bytes:sum\"\n + \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"}) + by (namespace, pod)\n * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, + \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_memory_bytes:sum\"\n + \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"}) + by (namespace, pod)\n * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, + \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_cpu_cores:sum\"\n- + \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info) + by (node))\"\n \"record\": \":kube_pod_info_node_count:\"\n - 
\"expr\": |\n + \ max(label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\", + \"pod\", \"(.*)\")) by (node, namespace, pod)\n \"record\": \"node_namespace_pod:kube_pod_info:\"\n + \ - \"expr\": |\n count by (node) (sum by (node, cpu) (\n node_cpu{job=\"node-exporter\"}\n + \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n + \ ))\n \"record\": \"node:node_num_cpu:sum\"\n - \"expr\": |\n 1 + - avg(rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m]))\n \"record\": + \":node_cpu_utilisation:avg1m\"\n - \"expr\": |\n 1 - avg by (node) (\n + \ rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m])\n * on (namespace, + pod) group_left(node)\n node_namespace_pod:kube_pod_info:)\n \"record\": + \"node:node_cpu_utilisation:avg1m\"\n - \"expr\": |\n sum(node_load1{job=\"node-exporter\"})\n + \ /\n sum(node:node_num_cpu:sum)\n \"record\": \":node_cpu_saturation_load1:\"\n + \ - \"expr\": |\n sum by (node) (\n node_load1{job=\"node-exporter\"}\n + \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n + \ )\n /\n node:node_num_cpu:sum\n \"record\": \"node:node_cpu_saturation_load1:\"\n + \ - \"expr\": |\n 1 -\n sum(node_memory_MemFree{job=\"node-exporter\"} + + node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n + \ /\n sum(node_memory_MemTotal{job=\"node-exporter\"})\n \"record\": + \":node_memory_utilisation:\"\n - \"expr\": |\n sum by (node) (\n (node_memory_MemFree{job=\"node-exporter\"} + + node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n + \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n + \ )\n \"record\": \"node:node_memory_bytes_available:sum\"\n - \"expr\": + |\n sum by (node) (\n node_memory_MemTotal{job=\"node-exporter\"}\n + \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n + \ )\n \"record\": \"node:node_memory_bytes_total:sum\"\n - \"expr\": |\n + \ 
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)\n + \ /\n scalar(sum(node:node_memory_bytes_total:sum))\n \"record\": + \"node:node_memory_utilisation:ratio\"\n - \"expr\": |\n 1e3 * sum(\n (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n + \ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n )\n \"record\": + \":node_memory_swap_io_bytes:sum_rate\"\n - \"expr\": |\n 1 -\n sum + by (node) (\n (node_memory_MemFree{job=\"node-exporter\"} + node_memory_Cached{job=\"node-exporter\"} + + node_memory_Buffers{job=\"node-exporter\"})\n * on (namespace, pod) group_left(node)\n + \ node_namespace_pod:kube_pod_info:\n )\n /\n sum by (node) + (\n node_memory_MemTotal{job=\"node-exporter\"}\n * on (namespace, + pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n )\n \"record\": + \"node:node_memory_utilisation:\"\n - \"expr\": |\n 1 - (node:node_memory_bytes_available:sum + / node:node_memory_bytes_total:sum)\n \"record\": \"node:node_memory_utilisation_2:\"\n + \ - \"expr\": |\n 1e3 * sum by (node) (\n (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n + \ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n * on (namespace, + pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n )\n \"record\": + \"node:node_memory_swap_io_bytes:sum_rate\"\n - \"expr\": |\n avg(irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) + / 1e3)\n \"record\": \":node_disk_utilisation:avg_irate\"\n - \"expr\": |\n + \ avg by (node) (\n irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) + / 1e3\n * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n + \ )\n \"record\": \"node:node_disk_utilisation:avg_irate\"\n - \"expr\": + |\n avg(irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) + / 1e3)\n \"record\": \":node_disk_saturation:avg_irate\"\n - \"expr\": |\n + \ avg by (node) (\n 
irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) + / 1e3\n * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n + \ )\n \"record\": \"node:node_disk_saturation:avg_irate\"\n - \"expr\": + |\n sum(irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m])) + +\n sum(irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n + \ \"record\": \":node_net_utilisation:sum_irate\"\n - \"expr\": |\n sum + by (node) (\n (irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]) + +\n irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n + \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n + \ )\n \"record\": \"node:node_net_utilisation:sum_irate\"\n - \"expr\": + |\n sum(irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m])) + +\n sum(irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n + \ \"record\": \":node_net_saturation:sum_irate\"\n - \"expr\": |\n sum + by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]) + +\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n + \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n + \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-apps\"\n + \ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n + \ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is restarting {{ printf \\\"%.2f\\\" $value }} / second\"\n \"expr\": |\n + \ rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[15m]) + > 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"KubePodNotReady\"\n \"annotations\": \n \"message\": + \"{{ $labels.namespace }}/{{ $labels.pod }} is not ready.\"\n \"expr\": |\n + \ sum by 
(namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", + phase!~\"Running|Succeeded\"}) > 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": + \"critical\"\n - \"alert\": \"KubeDeploymentGenerationMismatch\"\n \"annotations\": + \n \"message\": \"Deployment {{ $labels.namespace }}/{{ labels.deployment + }} generation mismatch\"\n \"expr\": |\n kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n + \ !=\n kube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n + \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"KubeDeploymentReplicasMismatch\"\n \"annotations\": \n \"message\": + \"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch\"\n + \ \"expr\": |\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n + \ !=\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n + \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": + \"kubernetes-resources\"\n \"rules\": \n - \"alert\": \"KubeCPUOvercommit\"\n + \ \"annotations\": \n \"message\": \"Overcommited CPU resource requests + on Pods, cannot tolerate node failure.\"\n \"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)\n + \ /\n sum(node:node_num_cpu:sum)\n >\n (count(node:node_num_cpu:sum)-1) + / count(node:node_num_cpu:sum)\n \"for\": \"5m\"\n \"labels\": \n \"severity\": + \"warning\"\n - \"alert\": \"KubeMemOvercommit\"\n \"annotations\": \n \"message\": + \"Overcommited Memory resource requests on Pods, cannot tolerate node failure.\"\n + \ \"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)\n + \ /\n sum(node_memory_MemTotal)\n >\n (count(node:node_num_cpu:sum)-1)\n + \ /\n count(node:node_num_cpu:sum)\n \"for\": \"5m\"\n \"labels\": + \n \"severity\": \"warning\"\n - \"alert\": \"KubeCPUOvercommit\"\n \"annotations\": + \n \"message\": \"Overcommited CPU resource request quota on Namespaces.\"\n + 
\ \"expr\": |\n sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", + resource=\"requests.cpu\"})\n /\n sum(node:node_num_cpu:sum)\n > + 1.5\n \"for\": \"5m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"KubeMemOvercommit\"\n \"annotations\": \n \"message\": + \"Overcommited Memory resource request quota on Namespaces.\"\n \"expr\": |\n + \ sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"requests.memory\"})\n + \ /\n sum(node_memory_MemTotal{job=\"node-exporter\"})\n > 1.5\n + \ \"for\": \"5m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": + \"KubeQuotaExceeded\"\n \"annotations\": \n \"message\": \"{{ printf \\\"%0.0f\\\" + $value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}.\"\n + \ \"expr\": |\n 100 * kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n + \ / ignoring(instance, job, type)\n kube_resourcequota{job=\"kube-state-metrics\", + type=\"hard\"}\n > 90\n \"for\": \"15m\"\n \"labels\": \n \"severity\": + \"warning\"\n- \"name\": \"kubernetes-storage\"\n \"rules\": \n - \"alert\": + \"KubePersistentVolumeUsageCritical\"\n \"annotations\": \n \"message\": + \"The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace + {{ $labels.namespace }} has {{ printf \\\"%0.0f\\\" $value }}% free.\"\n \"expr\": + |\n 100 * kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\n + \ kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n < 3\n \"for\": + \"1m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubePersistentVolumeFullInFourDays\"\n + \ \"annotations\": \n \"message\": \"Based on recent sampling, the persistent + volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace + }} is expected to fill up within four days.\"\n \"expr\": |\n predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[1h], + 4 * 24 * 3600) < 0\n \"for\": \"5m\"\n \"labels\": \n \"severity\": + 
\"critical\"\n- \"name\": \"kubernetes-system\"\n \"rules\": \n - \"alert\": + \"KubeNodeNotReady\"\n \"annotations\": \n \"message\": \"{{ $labels.node + }} has been unready for more than an hour\"\n \"expr\": |\n max(kube_node_status_ready{job=\"kube-state-metrics\", + condition=\"false\"} == 1) BY (node)\n \"for\": \"1h\"\n \"labels\": \n + \ \"severity\": \"warning\"\n - \"alert\": \"KubeVersionMismatch\"\n \"annotations\": + \n \"message\": \"There are {{ $value }} different versions of Kubernetes + components running.\"\n \"expr\": |\n count(count(kubernetes_build_info{job!=\"kube-dns\"}) + by (gitVersion)) > 1\n \"for\": \"1h\"\n \"labels\": \n \"severity\": + \"warning\"\n - \"alert\": \"KubeClientErrors\"\n \"annotations\": \n \"message\": + \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing + {{ printf \\\"%0.0f\\\" $value }}% errors.'\"\n \"expr\": |\n sum(rate(rest_client_requests_total{code!~\"2..\"}[5m])) + by (instance, job) * 100\n /\n sum(rate(rest_client_requests_total[5m])) + by (instance, job)\n > 1\n \"for\": \"15m\"\n \"labels\": \n \"severity\": + \"warning\"\n - \"alert\": \"KubeClientErrors\"\n \"annotations\": \n \"message\": + \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing + {{ printf \\\"%0.0f\\\" $value }} errors / sec.'\"\n \"expr\": |\n sum(rate(ksm_scrape_error_total{job=\"kube-state-metrics\"}[5m])) + by (instance, job) > 0.1\n \"for\": \"15m\"\n \"labels\": \n \"severity\": + \"warning\"" +kind: ConfigMap +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-k8s-rules + namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-account.yaml b/manifests/prometheus-service-account.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-account.yaml rename to manifests/prometheus-service-account.yaml diff --git 
a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml b/manifests/prometheus-service-monitor-apiserver.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-apiserver.yaml rename to manifests/prometheus-service-monitor-apiserver.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml b/manifests/prometheus-service-monitor-core-dns.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-coredns.yaml rename to manifests/prometheus-service-monitor-core-dns.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml b/manifests/prometheus-service-monitor-kube-controller-manager.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-controller-manager.yaml rename to manifests/prometheus-service-monitor-kube-controller-manager.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml b/manifests/prometheus-service-monitor-kube-scheduler.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-kube-scheduler.yaml rename to manifests/prometheus-service-monitor-kube-scheduler.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus-service-monitor-kubelet.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-kubelet.yaml rename to manifests/prometheus-service-monitor-kubelet.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-service-monitor.yaml b/manifests/prometheus-service-monitor-prometheus-operator.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator-service-monitor.yaml rename to manifests/prometheus-service-monitor-prometheus-operator.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus.yaml 
b/manifests/prometheus-service-monitor-prometheus.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service-monitor-prometheus.yaml rename to manifests/prometheus-service-monitor-prometheus.yaml diff --git a/manifests/prometheus-k8s/prometheus-k8s-service.yaml b/manifests/prometheus-service.yaml similarity index 100% rename from manifests/prometheus-k8s/prometheus-k8s-service.yaml rename to manifests/prometheus-service.yaml From 3388f96afe68dada6aeafa1e2304da42d37b9279 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 30 Apr 2018 19:48:14 +0200 Subject: [PATCH 245/638] Address comments --- .gitignore | 2 + Makefile | 10 +- README.md | 100 ++++++++++++------ build.sh | 7 ++ example.jsonnet | 13 +++ examples/minikube.jsonnet | 17 +++ hack/cluster-monitoring/deploy | 38 ------- hack/cluster-monitoring/teardown | 4 - hack/scripts/build-jsonnet.sh | 21 ---- hack/scripts/kube-prometheus-base.jsonnet | 12 --- hack/scripts/kube-prometheus-minikube.jsonnet | 16 --- .../kube-prometheus/kube-prometheus.libsonnet | 9 +- jsonnetfile.json | 14 +++ manifests/00namespace-namespace.yaml | 4 + ...alertmanagerCustomResourceDefinition.yaml} | 0 ...-0prometheusCustomResourceDefinition.yaml} | 0 ...rvicemonitorCustomResourceDefinition.yaml} | 0 ... => 0prometheus-operator-clusterRole.yaml} | 0 ...ometheus-operator-clusterRoleBinding.yaml} | 0 ... 
0prometheus-operator-serviceAccount.yaml} | 0 ....yaml => alertmanager-serviceAccount.yaml} | 0 ....yaml => alertmanager-serviceMonitor.yaml} | 0 ...yaml => grafana-dashboardDatasources.yaml} | 0 ...yaml => grafana-dashboardDefinitions.yaml} | 0 ...ces.yaml => grafana-dashboardSources.yaml} | 0 ...count.yaml => grafana-serviceAccount.yaml} | 0 ...ml => kube-state-metrics-clusterRole.yaml} | 0 ...ube-state-metrics-clusterRoleBinding.yaml} | 0 ...ml => kube-state-metrics-roleBinding.yaml} | 0 ...=> kube-state-metrics-serviceAccount.yaml} | 0 ...=> kube-state-metrics-serviceMonitor.yaml} | 0 ...le.yaml => node-exporter-clusterRole.yaml} | 0 ... => node-exporter-clusterRoleBinding.yaml} | 0 ...yaml => node-exporter-serviceAccount.yaml} | 0 ...yaml => node-exporter-serviceMonitor.yaml} | 0 ...-role.yaml => prometheus-clusterRole.yaml} | 0 ...aml => prometheus-clusterRoleBinding.yaml} | 0 ...yaml => prometheus-roleBindingConfig.yaml} | 0 ...aml => prometheus-roleBindingDefault.yaml} | 0 ... => prometheus-roleBindingKubeSystem.yaml} | 0 ...l => prometheus-roleBindingNamespace.yaml} | 0 ...config.yaml => prometheus-roleConfig.yaml} | 0 ...fault.yaml => prometheus-roleDefault.yaml} | 0 ...em.yaml => prometheus-roleKubeSystem.yaml} | 0 ...ace.yaml => prometheus-roleNamespace.yaml} | 0 ...nt.yaml => prometheus-serviceAccount.yaml} | 0 ...> prometheus-serviceMonitorApiserver.yaml} | 0 ... => prometheus-serviceMonitorCoreDNS.yaml} | 0 ...-serviceMonitorKubeControllerManager.yaml} | 0 ...ometheus-serviceMonitorKubeScheduler.yaml} | 0 ... => prometheus-serviceMonitorKubelet.yaml} | 0 ... 
prometheus-serviceMonitorPrometheus.yaml} | 0 ...eus-serviceMonitorPrometheusOperator.yaml} | 0 53 files changed, 138 insertions(+), 129 deletions(-) create mode 100755 build.sh create mode 100644 example.jsonnet create mode 100644 examples/minikube.jsonnet delete mode 100755 hack/cluster-monitoring/deploy delete mode 100755 hack/cluster-monitoring/teardown delete mode 100755 hack/scripts/build-jsonnet.sh delete mode 100644 hack/scripts/kube-prometheus-base.jsonnet delete mode 100644 hack/scripts/kube-prometheus-minikube.jsonnet create mode 100644 jsonnetfile.json create mode 100644 manifests/00namespace-namespace.yaml rename manifests/{0prometheus-operator-0alertmanager-custom-resource-definition.yaml => 0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml} (100%) rename manifests/{0prometheus-operator-0prometheus-custom-resource-definition.yaml => 0prometheus-operator-0prometheusCustomResourceDefinition.yaml} (100%) rename manifests/{0prometheus-operator-0servicemonitor-custom-resource-definition.yaml => 0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml} (100%) rename manifests/{0prometheus-operator-cluster-role.yaml => 0prometheus-operator-clusterRole.yaml} (100%) rename manifests/{0prometheus-operator-cluster-role-binding.yaml => 0prometheus-operator-clusterRoleBinding.yaml} (100%) rename manifests/{0prometheus-operator-service-account.yaml => 0prometheus-operator-serviceAccount.yaml} (100%) rename manifests/{alertmanager-service-account.yaml => alertmanager-serviceAccount.yaml} (100%) rename manifests/{alertmanager-service-monitor.yaml => alertmanager-serviceMonitor.yaml} (100%) rename manifests/{grafana-dashboard-datasources.yaml => grafana-dashboardDatasources.yaml} (100%) rename manifests/{grafana-dashboard-definitions.yaml => grafana-dashboardDefinitions.yaml} (100%) rename manifests/{grafana-dashboard-sources.yaml => grafana-dashboardSources.yaml} (100%) rename manifests/{grafana-service-account.yaml => 
grafana-serviceAccount.yaml} (100%) rename manifests/{kube-state-metrics-cluster-role.yaml => kube-state-metrics-clusterRole.yaml} (100%) rename manifests/{kube-state-metrics-cluster-role-binding.yaml => kube-state-metrics-clusterRoleBinding.yaml} (100%) rename manifests/{kube-state-metrics-role-binding.yaml => kube-state-metrics-roleBinding.yaml} (100%) rename manifests/{kube-state-metrics-service-account.yaml => kube-state-metrics-serviceAccount.yaml} (100%) rename manifests/{kube-state-metrics-service-monitor.yaml => kube-state-metrics-serviceMonitor.yaml} (100%) rename manifests/{node-exporter-cluster-role.yaml => node-exporter-clusterRole.yaml} (100%) rename manifests/{node-exporter-cluster-role-binding.yaml => node-exporter-clusterRoleBinding.yaml} (100%) rename manifests/{node-exporter-service-account.yaml => node-exporter-serviceAccount.yaml} (100%) rename manifests/{node-exporter-service-monitor.yaml => node-exporter-serviceMonitor.yaml} (100%) rename manifests/{prometheus-cluster-role.yaml => prometheus-clusterRole.yaml} (100%) rename manifests/{prometheus-cluster-role-binding.yaml => prometheus-clusterRoleBinding.yaml} (100%) rename manifests/{prometheus-role-binding-config.yaml => prometheus-roleBindingConfig.yaml} (100%) rename manifests/{prometheus-role-binding-default.yaml => prometheus-roleBindingDefault.yaml} (100%) rename manifests/{prometheus-role-binding-kube-system.yaml => prometheus-roleBindingKubeSystem.yaml} (100%) rename manifests/{prometheus-role-binding-namespace.yaml => prometheus-roleBindingNamespace.yaml} (100%) rename manifests/{prometheus-role-config.yaml => prometheus-roleConfig.yaml} (100%) rename manifests/{prometheus-role-default.yaml => prometheus-roleDefault.yaml} (100%) rename manifests/{prometheus-role-kube-system.yaml => prometheus-roleKubeSystem.yaml} (100%) rename manifests/{prometheus-role-namespace.yaml => prometheus-roleNamespace.yaml} (100%) rename manifests/{prometheus-service-account.yaml => 
prometheus-serviceAccount.yaml} (100%) rename manifests/{prometheus-service-monitor-apiserver.yaml => prometheus-serviceMonitorApiserver.yaml} (100%) rename manifests/{prometheus-service-monitor-core-dns.yaml => prometheus-serviceMonitorCoreDNS.yaml} (100%) rename manifests/{prometheus-service-monitor-kube-controller-manager.yaml => prometheus-serviceMonitorKubeControllerManager.yaml} (100%) rename manifests/{prometheus-service-monitor-kube-scheduler.yaml => prometheus-serviceMonitorKubeScheduler.yaml} (100%) rename manifests/{prometheus-service-monitor-kubelet.yaml => prometheus-serviceMonitorKubelet.yaml} (100%) rename manifests/{prometheus-service-monitor-prometheus.yaml => prometheus-serviceMonitorPrometheus.yaml} (100%) rename manifests/{prometheus-service-monitor-prometheus-operator.yaml => prometheus-serviceMonitorPrometheusOperator.yaml} (100%) diff --git a/.gitignore b/.gitignore index 0887fe6e..133fdf90 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ tmp/ minikube-manifests/ +jsonnetfile.lock.json +vendor/ diff --git a/Makefile b/Makefile index 90736d61..a7903cf8 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,9 @@ -.PHONY: image - image: docker build -f ../../scripts/jsonnet/Dockerfile -t po-jsonnet ../../ generate: image @echo ">> Compiling assets and generating Kubernetes manifests" - docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v `pwd`:/go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make generate-raw + docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make crdtojsonnet generate-raw crdtojsonnet: cat ../../example/prometheus-operator-crd/alertmanager.crd.yaml | gojsontoyaml -yamltojson > 
jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet @@ -13,5 +11,7 @@ crdtojsonnet: cat ../../example/prometheus-operator-crd/servicemonitor.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet generate-raw: - cd jsonnet/kube-prometheus; jb install - ./hack/scripts/build-jsonnet.sh hack/scripts/kube-prometheus-base.jsonnet manifests + jb install + ./build.sh + +.PHONY: image generate crdtojsonnet generate-raw diff --git a/README.md b/README.md index ac4497d2..3d1d27f8 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ $ minikube delete && minikube start --kubernetes-version=v1.10.1 --memory=4096 - ## Quickstart -Although this project is intended to be used as a library, a compiled version of the Kubernetes manifests generated with this library is checked into this repository in order to try the content our quickly. +Although this project is intended to be used as a library, a compiled version of the Kubernetes manifests generated with this library is checked into this repository in order to try the content out quickly. 
Simply create the stack: @@ -55,47 +55,44 @@ $ jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonn You may wish to not use ksonnet and simply render the generated manifests to files on disk, this can be done with: -[embedmd]:# (hack/scripts/kube-prometheus-base.jsonnet) +[embedmd]:# (example.jsonnet) ```jsonnet -local kp = (import "kube-prometheus/kube-prometheus.libsonnet") + { - _config+:: { - namespace: "monitoring", - } +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, }; -{["0prometheus-operator-"+name+".yaml"]: std.manifestYamlDoc(kp.prometheusOperator[name]) for name in std.objectFields(kp.prometheusOperator)} + -{["node-exporter-"+name+".yaml"]: std.manifestYamlDoc(kp.nodeExporter[name]) for name in std.objectFields(kp.nodeExporter)} + -{["kube-state-metrics-"+name+".yaml"]: std.manifestYamlDoc(kp.kubeStateMetrics[name]) for name in std.objectFields(kp.kubeStateMetrics)} + -{["alertmanager-"+name+".yaml"]: std.manifestYamlDoc(kp.alertmanager[name]) for name in std.objectFields(kp.alertmanager)} + -{["prometheus-"+name+".yaml"]: std.manifestYamlDoc(kp.prometheus[name]) for name in std.objectFields(kp.prometheus)} + -{["grafana-"+name+".yaml"]: std.manifestYamlDoc(kp.grafana[name]) for name in std.objectFields(kp.grafana)} +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + 
name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` -This renders all manifests in a json structure of `{filename: manifest-content}`. To split this into files on disk use: +This renders all manifests in a json structure of `{filename: manifest-content}`. -> Note you need `jsonnet`, `jq`, `sed`, `tr` and `gojsonyaml` (`go get github.com/brancz/gojsontoyaml`) installed. +### Compiling -```bash -jsonnet -J vendor example.jsonnet > tmp.json +To compile the above and get each manifest in a separate file on disk use the following script: -files=$(jq -r 'keys[]' tmp.json) +[embedmd]:# (build.sh) +```sh +#!/usr/bin/env bash +set -e +set -x -for file in ${files}; do - # prepare directory - dir=$(dirname "${file}") - path="${dir}" - mkdir -p ${path} + # optional, but we would like to generate yaml, not json +jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm $1' -- {} - # covert file name to snake case with dashes - fullfile=$(echo ${file} | sed -r 's/([a-z0-9])([A-Z])/\1-\L\2/g' | tr '[:upper:]' '[:lower:]') - - # write each value to the path in key; convert multiple times to prettify yaml - jq -r ".[\"${file}\"]" tmp.json | gojsontoyaml -yamltojson | gojsontoyaml > "${fullfile}" -done - -rm tmp.json ``` +> Note you need `jsonnet` and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. + +This script reads each key of the generated json and uses that as the file name, and writes the value of that key to that file. + ## Configuration A hidden `_config` field is located at the top level of the object this library provides. These are the available fields with their respective default values: @@ -145,14 +142,28 @@ Jsonnet is a turing complete language, any logic can be reflected in it. 
It also A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For [kubeadm]() and [bootkube]() clusters there are mixins available to easily configure these: kubeadm: + [embedmd]:# (examples/kubeadm.jsonnet) +```jsonnet +(import "kube-prometheus/kube-prometheus.libsonnet") + +(import "kube-prometheus/kube-prometheus-kubeadm.libsonnet") +``` bootkube: + [embedmd]:# (examples/bootkube.jsonnet) +```jsonnet +(import "kube-prometheus/kube-prometheus.libsonnet") + +(import "kube-prometheus/kube-prometheus-bootkube.libsonnet") +``` Another mixin that may be useful for exploring the stack is to expose the UIs of Prometheus, Alertmanager and Grafana on NodePorts: [embedmd]:# (examples/node-ports.jsonnet) +```jsonnet +(import "kube-prometheus/kube-prometheus.libsonnet") + +(import "kube-prometheus/kube-prometheus-node-ports.libsonnet") +``` For example the name of the `Prometheus` object provided by this library can be overridden: @@ -179,7 +190,34 @@ local daemonset = k.apps.v1beta2.daemonSet; ((import "kube-prometheus/kube-prometheus.libsonnet") + { nodeExporter+: { daemonset+: - daemonset.mixin.metadata.withNamespace("my-custom-namespace") + + daemonset.mixin.metadata.withNamespace("my-custom-namespace") } }).nodeExporter.daemonset ``` + +## Example + +To use an easy to reproduce example, let's take the minikube setup as demonstrated in [prerequisites](#Prerequisites). It is a kubeadm cluster (as we use the kubeadm bootstrapper) and because we would like easy access to our Prometheus, Alertmanager and Grafana UI we want the services to be exposed as NodePort type services: + +> Note that NodePort type services are likely not a good idea for your production use case; they are only used for demonstration purposes here. 
+ +[embedmd]:# (examples/minikube.jsonnet) +```jsonnet +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + + { + _config+:: { + namespace: 'monitoring', + }, + }; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` diff --git a/build.sh b/build.sh new file mode 100755 index 00000000..cefa57e8 --- /dev/null +++ b/build.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -e +set -x + + # optional, but we would like to generate yaml, not json +jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm $1' -- {} + diff --git a/example.jsonnet b/example.jsonnet new file mode 100644 index 00000000..1d36eb1f --- /dev/null +++ b/example.jsonnet @@ -0,0 +1,13 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ 
['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/examples/minikube.jsonnet b/examples/minikube.jsonnet new file mode 100644 index 00000000..ed1a05c4 --- /dev/null +++ b/examples/minikube.jsonnet @@ -0,0 +1,17 @@ +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + + { + _config+:: { + namespace: 'monitoring', + }, + }; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy deleted file mode 100755 index 6be94e01..00000000 --- a/hack/cluster-monitoring/deploy +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash -# exit immediately when a command fails -set -e -# only exit with zero if all commands of the pipeline exit successfully -set -o pipefail -# error on unset variables -set -u -# print each command before executing it -set -x - 
-manifest_prefix=${1-./manifests} - -kubectl create namespace monitoring - -find ${manifest_prefix}/prometheus-operator/ -type f ! -name service-monitor.yaml -exec kubectl apply -f {} \; - -# Wait for CRDs to be ready. -printf "Waiting for Operator to register custom resource definitions..." -until kubectl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kubectl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kubectl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kubectl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kubectl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kubectl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -echo "done!" - -# need to ensure that ServiceMonitors are registered before we can create the prometheus-operator ServiceMonitor -kubectl apply -f ${manifest_prefix}/prometheus-operator/service-monitor.yaml - -kubectl apply -f ${manifest_prefix}/node-exporter/ -kubectl apply -f ${manifest_prefix}/kube-state-metrics/ -find ${manifest_prefix}/grafana/ -type f ! 
-name dashboard-definitions.yaml -exec kubectl apply -f {} \; - -# kubectl apply wants to put the previous version in an annotation, which is too large, therefore create instead of apply -kubectl create -f ${manifest_prefix}/grafana/dashboard-definitions.yaml -kubectl apply -f ${manifest_prefix}/prometheus/ -kubectl apply -f ${manifest_prefix}/alertmanager/ - diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown deleted file mode 100755 index 0ef9a6b3..00000000 --- a/hack/cluster-monitoring/teardown +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -kubectl delete namespace monitoring - diff --git a/hack/scripts/build-jsonnet.sh b/hack/scripts/build-jsonnet.sh deleted file mode 100755 index 7a754e6c..00000000 --- a/hack/scripts/build-jsonnet.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash -set -e -set -x - -jsonnet="${1-kube-prometheus.jsonnet}" -prefix="${2-manifests}" -json="tmp/manifests.json" - -rm -rf ${prefix} -mkdir -p $(dirname "${json}") -jsonnet -J jsonnet/kube-prometheus/vendor -J jsonnet ${jsonnet} > ${json} - -files=$(jq -r 'keys[]' ${json}) - -for file in ${files}; do - dir=$(dirname "${file}") - path="${prefix}/${dir}" - mkdir -p ${path} - fullfile=$(echo ${file} | sed -r 's/([a-z0-9])([A-Z])/\1-\L\2/g' | tr '[:upper:]' '[:lower:]') - jq -r ".[\"${file}\"]" ${json} | gojsontoyaml -yamltojson | gojsontoyaml > "${prefix}/${fullfile}" -done diff --git a/hack/scripts/kube-prometheus-base.jsonnet b/hack/scripts/kube-prometheus-base.jsonnet deleted file mode 100644 index 84eb3c29..00000000 --- a/hack/scripts/kube-prometheus-base.jsonnet +++ /dev/null @@ -1,12 +0,0 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, -}; - -{ ['0prometheus-operator-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheusOperator[name]) for name in std.objectFields(kp.prometheusOperator) } + -{ ['node-exporter-' + name + '.yaml']: std.manifestYamlDoc(kp.nodeExporter[name]) 
for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name + '.yaml']: std.manifestYamlDoc(kp.kubeStateMetrics[name]) for name in std.objectFields(kp.kubeStateMetrics) } + -{ ['alertmanager-' + name + '.yaml']: std.manifestYamlDoc(kp.alertmanager[name]) for name in std.objectFields(kp.alertmanager) } + -{ ['prometheus-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheus[name]) for name in std.objectFields(kp.prometheus) } + -{ ['grafana-' + name + '.yaml']: std.manifestYamlDoc(kp.grafana[name]) for name in std.objectFields(kp.grafana) } diff --git a/hack/scripts/kube-prometheus-minikube.jsonnet b/hack/scripts/kube-prometheus-minikube.jsonnet deleted file mode 100644 index 9a6fff17..00000000 --- a/hack/scripts/kube-prometheus-minikube.jsonnet +++ /dev/null @@ -1,16 +0,0 @@ -local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + - (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - { - _config+:: { - namespace: 'monitoring', - }, - }; - -{ ['0prometheus-operator-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheusOperator[name]) for name in std.objectFields(kp.prometheusOperator) } + -{ ['node-exporter-' + name + '.yaml']: std.manifestYamlDoc(kp.nodeExporter[name]) for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name + '.yaml']: std.manifestYamlDoc(kp.kubeStateMetrics[name]) for name in std.objectFields(kp.kubeStateMetrics) } + -{ ['alertmanager-' + name + '.yaml']: std.manifestYamlDoc(kp.alertmanager[name]) for name in std.objectFields(kp.alertmanager) } + -{ ['prometheus-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheus[name]) for name in std.objectFields(kp.prometheus) } + -{ ['grafana-' + name + '.yaml']: std.manifestYamlDoc(kp.grafana[name]) for name in std.objectFields(kp.grafana) } diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 
36eae76d..e79b7567 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -6,9 +6,14 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; (import 'alertmanager/alertmanager.libsonnet') + (import 'prometheus-operator/prometheus-operator.libsonnet') + (import 'prometheus/prometheus.libsonnet') + -(import 'kubernetes-mixin/mixin.libsonnet') + -{ +(import 'kubernetes-mixin/mixin.libsonnet') + { + kubePrometheus+:: { + namespace: k.core.v1.namespace.new($._config.namespace), + }, +} + { _config+:: { + namespace: 'default', + kubeStateMetricsSelector: 'job="kube-state-metrics"', cadvisorSelector: 'job="kubelet"', nodeExporterSelector: 'job="node-exporter"', diff --git a/jsonnetfile.json b/jsonnetfile.json new file mode 100644 index 00000000..b4ebb0f2 --- /dev/null +++ b/jsonnetfile.json @@ -0,0 +1,14 @@ +{ + "dependencies": [ + { + "name": "kube-prometheus", + "source": { + "git": { + "remote": "../../", + "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" + } + }, + "version": "." 
+ } + ] +} \ No newline at end of file diff --git a/manifests/00namespace-namespace.yaml b/manifests/00namespace-namespace.yaml new file mode 100644 index 00000000..d3252360 --- /dev/null +++ b/manifests/00namespace-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring diff --git a/manifests/0prometheus-operator-0alertmanager-custom-resource-definition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml similarity index 100% rename from manifests/0prometheus-operator-0alertmanager-custom-resource-definition.yaml rename to manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml diff --git a/manifests/0prometheus-operator-0prometheus-custom-resource-definition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml similarity index 100% rename from manifests/0prometheus-operator-0prometheus-custom-resource-definition.yaml rename to manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml diff --git a/manifests/0prometheus-operator-0servicemonitor-custom-resource-definition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml similarity index 100% rename from manifests/0prometheus-operator-0servicemonitor-custom-resource-definition.yaml rename to manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml diff --git a/manifests/0prometheus-operator-cluster-role.yaml b/manifests/0prometheus-operator-clusterRole.yaml similarity index 100% rename from manifests/0prometheus-operator-cluster-role.yaml rename to manifests/0prometheus-operator-clusterRole.yaml diff --git a/manifests/0prometheus-operator-cluster-role-binding.yaml b/manifests/0prometheus-operator-clusterRoleBinding.yaml similarity index 100% rename from manifests/0prometheus-operator-cluster-role-binding.yaml rename to manifests/0prometheus-operator-clusterRoleBinding.yaml diff --git a/manifests/0prometheus-operator-service-account.yaml 
b/manifests/0prometheus-operator-serviceAccount.yaml similarity index 100% rename from manifests/0prometheus-operator-service-account.yaml rename to manifests/0prometheus-operator-serviceAccount.yaml diff --git a/manifests/alertmanager-service-account.yaml b/manifests/alertmanager-serviceAccount.yaml similarity index 100% rename from manifests/alertmanager-service-account.yaml rename to manifests/alertmanager-serviceAccount.yaml diff --git a/manifests/alertmanager-service-monitor.yaml b/manifests/alertmanager-serviceMonitor.yaml similarity index 100% rename from manifests/alertmanager-service-monitor.yaml rename to manifests/alertmanager-serviceMonitor.yaml diff --git a/manifests/grafana-dashboard-datasources.yaml b/manifests/grafana-dashboardDatasources.yaml similarity index 100% rename from manifests/grafana-dashboard-datasources.yaml rename to manifests/grafana-dashboardDatasources.yaml diff --git a/manifests/grafana-dashboard-definitions.yaml b/manifests/grafana-dashboardDefinitions.yaml similarity index 100% rename from manifests/grafana-dashboard-definitions.yaml rename to manifests/grafana-dashboardDefinitions.yaml diff --git a/manifests/grafana-dashboard-sources.yaml b/manifests/grafana-dashboardSources.yaml similarity index 100% rename from manifests/grafana-dashboard-sources.yaml rename to manifests/grafana-dashboardSources.yaml diff --git a/manifests/grafana-service-account.yaml b/manifests/grafana-serviceAccount.yaml similarity index 100% rename from manifests/grafana-service-account.yaml rename to manifests/grafana-serviceAccount.yaml diff --git a/manifests/kube-state-metrics-cluster-role.yaml b/manifests/kube-state-metrics-clusterRole.yaml similarity index 100% rename from manifests/kube-state-metrics-cluster-role.yaml rename to manifests/kube-state-metrics-clusterRole.yaml diff --git a/manifests/kube-state-metrics-cluster-role-binding.yaml b/manifests/kube-state-metrics-clusterRoleBinding.yaml similarity index 100% rename from 
manifests/kube-state-metrics-cluster-role-binding.yaml rename to manifests/kube-state-metrics-clusterRoleBinding.yaml diff --git a/manifests/kube-state-metrics-role-binding.yaml b/manifests/kube-state-metrics-roleBinding.yaml similarity index 100% rename from manifests/kube-state-metrics-role-binding.yaml rename to manifests/kube-state-metrics-roleBinding.yaml diff --git a/manifests/kube-state-metrics-service-account.yaml b/manifests/kube-state-metrics-serviceAccount.yaml similarity index 100% rename from manifests/kube-state-metrics-service-account.yaml rename to manifests/kube-state-metrics-serviceAccount.yaml diff --git a/manifests/kube-state-metrics-service-monitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml similarity index 100% rename from manifests/kube-state-metrics-service-monitor.yaml rename to manifests/kube-state-metrics-serviceMonitor.yaml diff --git a/manifests/node-exporter-cluster-role.yaml b/manifests/node-exporter-clusterRole.yaml similarity index 100% rename from manifests/node-exporter-cluster-role.yaml rename to manifests/node-exporter-clusterRole.yaml diff --git a/manifests/node-exporter-cluster-role-binding.yaml b/manifests/node-exporter-clusterRoleBinding.yaml similarity index 100% rename from manifests/node-exporter-cluster-role-binding.yaml rename to manifests/node-exporter-clusterRoleBinding.yaml diff --git a/manifests/node-exporter-service-account.yaml b/manifests/node-exporter-serviceAccount.yaml similarity index 100% rename from manifests/node-exporter-service-account.yaml rename to manifests/node-exporter-serviceAccount.yaml diff --git a/manifests/node-exporter-service-monitor.yaml b/manifests/node-exporter-serviceMonitor.yaml similarity index 100% rename from manifests/node-exporter-service-monitor.yaml rename to manifests/node-exporter-serviceMonitor.yaml diff --git a/manifests/prometheus-cluster-role.yaml b/manifests/prometheus-clusterRole.yaml similarity index 100% rename from manifests/prometheus-cluster-role.yaml 
rename to manifests/prometheus-clusterRole.yaml diff --git a/manifests/prometheus-cluster-role-binding.yaml b/manifests/prometheus-clusterRoleBinding.yaml similarity index 100% rename from manifests/prometheus-cluster-role-binding.yaml rename to manifests/prometheus-clusterRoleBinding.yaml diff --git a/manifests/prometheus-role-binding-config.yaml b/manifests/prometheus-roleBindingConfig.yaml similarity index 100% rename from manifests/prometheus-role-binding-config.yaml rename to manifests/prometheus-roleBindingConfig.yaml diff --git a/manifests/prometheus-role-binding-default.yaml b/manifests/prometheus-roleBindingDefault.yaml similarity index 100% rename from manifests/prometheus-role-binding-default.yaml rename to manifests/prometheus-roleBindingDefault.yaml diff --git a/manifests/prometheus-role-binding-kube-system.yaml b/manifests/prometheus-roleBindingKubeSystem.yaml similarity index 100% rename from manifests/prometheus-role-binding-kube-system.yaml rename to manifests/prometheus-roleBindingKubeSystem.yaml diff --git a/manifests/prometheus-role-binding-namespace.yaml b/manifests/prometheus-roleBindingNamespace.yaml similarity index 100% rename from manifests/prometheus-role-binding-namespace.yaml rename to manifests/prometheus-roleBindingNamespace.yaml diff --git a/manifests/prometheus-role-config.yaml b/manifests/prometheus-roleConfig.yaml similarity index 100% rename from manifests/prometheus-role-config.yaml rename to manifests/prometheus-roleConfig.yaml diff --git a/manifests/prometheus-role-default.yaml b/manifests/prometheus-roleDefault.yaml similarity index 100% rename from manifests/prometheus-role-default.yaml rename to manifests/prometheus-roleDefault.yaml diff --git a/manifests/prometheus-role-kube-system.yaml b/manifests/prometheus-roleKubeSystem.yaml similarity index 100% rename from manifests/prometheus-role-kube-system.yaml rename to manifests/prometheus-roleKubeSystem.yaml diff --git a/manifests/prometheus-role-namespace.yaml 
b/manifests/prometheus-roleNamespace.yaml similarity index 100% rename from manifests/prometheus-role-namespace.yaml rename to manifests/prometheus-roleNamespace.yaml diff --git a/manifests/prometheus-service-account.yaml b/manifests/prometheus-serviceAccount.yaml similarity index 100% rename from manifests/prometheus-service-account.yaml rename to manifests/prometheus-serviceAccount.yaml diff --git a/manifests/prometheus-service-monitor-apiserver.yaml b/manifests/prometheus-serviceMonitorApiserver.yaml similarity index 100% rename from manifests/prometheus-service-monitor-apiserver.yaml rename to manifests/prometheus-serviceMonitorApiserver.yaml diff --git a/manifests/prometheus-service-monitor-core-dns.yaml b/manifests/prometheus-serviceMonitorCoreDNS.yaml similarity index 100% rename from manifests/prometheus-service-monitor-core-dns.yaml rename to manifests/prometheus-serviceMonitorCoreDNS.yaml diff --git a/manifests/prometheus-service-monitor-kube-controller-manager.yaml b/manifests/prometheus-serviceMonitorKubeControllerManager.yaml similarity index 100% rename from manifests/prometheus-service-monitor-kube-controller-manager.yaml rename to manifests/prometheus-serviceMonitorKubeControllerManager.yaml diff --git a/manifests/prometheus-service-monitor-kube-scheduler.yaml b/manifests/prometheus-serviceMonitorKubeScheduler.yaml similarity index 100% rename from manifests/prometheus-service-monitor-kube-scheduler.yaml rename to manifests/prometheus-serviceMonitorKubeScheduler.yaml diff --git a/manifests/prometheus-service-monitor-kubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml similarity index 100% rename from manifests/prometheus-service-monitor-kubelet.yaml rename to manifests/prometheus-serviceMonitorKubelet.yaml diff --git a/manifests/prometheus-service-monitor-prometheus.yaml b/manifests/prometheus-serviceMonitorPrometheus.yaml similarity index 100% rename from manifests/prometheus-service-monitor-prometheus.yaml rename to 
manifests/prometheus-serviceMonitorPrometheus.yaml diff --git a/manifests/prometheus-service-monitor-prometheus-operator.yaml b/manifests/prometheus-serviceMonitorPrometheusOperator.yaml similarity index 100% rename from manifests/prometheus-service-monitor-prometheus-operator.yaml rename to manifests/prometheus-serviceMonitorPrometheusOperator.yaml From 116aaf88be04f79b20776c1514243c85bd6d5ec0 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 8 May 2018 07:36:13 -0700 Subject: [PATCH 246/638] kube-prometheus: regenerate --- manifests/grafana-dashboardDefinitions.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 8fd4a0d6..af7e2749 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -794,7 +794,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"} - node_filesystem_free{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace)) by (pod,namespace) / scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace))) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"} - node_filesystem_avail{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace)) by (pod,namespace) / scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace))) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -1703,7 +1703,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "1 - sum(max by (device, node) (node_filesystem_free{fstype=\u007e\"ext[24]\"})) / sum(max by (device, node) (node_filesystem_size{fstype=\u007e\"ext[24]\"}))", + "expr": "1 - sum(max by (device, node) 
(node_filesystem_avail{fstype=\u007e\"ext[24]\"})) / sum(max by (device, node) (node_filesystem_size{fstype=\u007e\"ext[24]\"}))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Disk", @@ -5258,7 +5258,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "(\n sum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n- sum(node_filesystem_free{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n) * 100\n /\nsum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n", + "expr": "(\n sum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n- sum(node_filesystem_avail{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n) * 100\n /\nsum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "" From 2a6b3db38bcfe951aadcf8c105cd3b8aa6b303b5 Mon Sep 17 00:00:00 2001 From: Giancarlo Rubio Date: Thu, 10 May 2018 17:50:14 +0200 Subject: [PATCH 247/638] Fix CI running make generate (#1327) --- manifests/prometheus-rules.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 852a3362..a18275b6 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -94,7 +94,7 @@ data: \ sum by (namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", phase!~\"Running|Succeeded\"}) > 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeDeploymentGenerationMismatch\"\n \"annotations\": - \n \"message\": \"Deployment {{ $labels.namespace }}/{{ labels.deployment + \n \"message\": \"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation mismatch\"\n \"expr\": |\n kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n \ !=\n 
kube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - From a9e667d24cfa8025be8e8672fa98aa40268546bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?piglei=E2=84=A2?= Date: Fri, 11 May 2018 16:59:34 +0800 Subject: [PATCH 248/638] kube-prometheus: fix alert rule K8SManyNodesNotReady (#1313) * kube-prometheus: fix alert rule K8SManyNodesNotReady * fix alert "K8SManyNodesNotReady" in helm templates & make generate * Use sync_kube_prometheus.py to make rules in helm in sync --- assets/prometheus/rules/kubelet.rules.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml index 85547dd6..a4168404 100644 --- a/assets/prometheus/rules/kubelet.rules.yaml +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -13,7 +13,7 @@ groups: - alert: K8SManyNodesNotReady expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == - 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 + 0) / count(kube_node_status_condition{condition="Ready",status="true"})) * 100 > 20 for: 1m labels: severity: critical From 0124e8c27231871465579429c3bf7a569ba9bcdb Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Fri, 11 May 2018 11:23:10 +0200 Subject: [PATCH 249/638] contrib/kube-prometheus: add `-f` to `rm` in `build.sh` According to the man pages of `rm` and the `-f` option: > Attempt to remove the files without prompting for confirma- tion, > regardless of the file's permissions. If the file does not exist, do > not display a diagnostic message or modify the exit status to reflect an > error. The -f option overrides any previous -i options. This patch prevents the `build.sh` script from failing if files do not exist when attempting to delete them. 
--- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index cefa57e8..30e75852 100755 --- a/build.sh +++ b/build.sh @@ -3,5 +3,5 @@ set -e set -x # optional, but we would like to generate yaml, not json -jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm $1' -- {} +jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} From 0461c85098b84ea607a323c1815bb9acdf80bea0 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 14 May 2018 11:55:16 +0200 Subject: [PATCH 250/638] docs: Run embedmd on both Docs and kube-prometheus README.md --- README.md | 4 +++- build.sh | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d1d27f8..663f7831 100644 --- a/README.md +++ b/README.md @@ -83,9 +83,11 @@ To compile the above and get each manifest in a separate file on disk use the fo #!/usr/bin/env bash set -e set -x +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail # optional, but we would like to generate yaml, not json -jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm $1' -- {} +jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} ``` diff --git a/build.sh b/build.sh index 30e75852..9c22672e 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,8 @@ #!/usr/bin/env bash set -e set -x +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail # optional, but we would like to generate yaml, not json jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} From fef0a659381b16b3385b2cd4cd5305405e4e13d0 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 15 May 2018 18:31:58 +0200 Subject: [PATCH 251/638] Adapt docs on additional rules 
and dashboards --- Makefile | 10 +- README.md | 26 ++- build.sh | 2 +- docs/developing-alerts-and-dashboards.md | 41 ---- ...prometheus-rules-and-grafana-dashboards.md | 217 ++++++++++++++++++ ...al etcd.md => monitoring-external-etcd.md} | 0 examples/example-grafana-dashboard.json | 177 ++++++++++++++ examples/example.rules.yaml | 9 + ...ditional-jsonnet-dashboard-example.jsonnet | 45 ++++ ...itional-rendered-dashboard-example.jsonnet | 16 ++ .../build-snippet.jsonnet | 7 + .../{ => jsonnet-snippets}/bootkube.jsonnet | 0 .../{ => jsonnet-snippets}/kubeadm.jsonnet | 0 .../{ => jsonnet-snippets}/node-ports.jsonnet | 0 examples/ksonnet-example.jsonnet | 14 +- ...heus-additional-alert-rule-example.jsonnet | 32 +++ ...-additional-recording-rule-example.jsonnet | 26 +++ ...s-additional-rendered-rule-example.jsonnet | 18 ++ .../prometheus/prometheus.libsonnet | 3 +- test.sh | 32 +++ 20 files changed, 613 insertions(+), 62 deletions(-) delete mode 100644 docs/developing-alerts-and-dashboards.md create mode 100644 docs/developing-prometheus-rules-and-grafana-dashboards.md rename docs/{Monitoring external etcd.md => monitoring-external-etcd.md} (100%) create mode 100644 examples/example-grafana-dashboard.json create mode 100644 examples/example.rules.yaml create mode 100644 examples/grafana-additional-jsonnet-dashboard-example.jsonnet create mode 100644 examples/grafana-additional-rendered-dashboard-example.jsonnet create mode 100644 examples/jsonnet-build-snippet/build-snippet.jsonnet rename examples/{ => jsonnet-snippets}/bootkube.jsonnet (100%) rename examples/{ => jsonnet-snippets}/kubeadm.jsonnet (100%) rename examples/{ => jsonnet-snippets}/node-ports.jsonnet (100%) create mode 100644 examples/prometheus-additional-alert-rule-example.jsonnet create mode 100644 examples/prometheus-additional-recording-rule-example.jsonnet create mode 100644 examples/prometheus-additional-rendered-rule-example.jsonnet create mode 100755 test.sh diff --git a/Makefile b/Makefile index 
a7903cf8..9fc6113e 100644 --- a/Makefile +++ b/Makefile @@ -14,4 +14,12 @@ generate-raw: jb install ./build.sh -.PHONY: image generate crdtojsonnet generate-raw +test: image + @echo ">> Compiling assets and generating Kubernetes manifests" + docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make test-raw + +test-raw: crdtojsonnet + jb install + ./test.sh + +.PHONY: image generate crdtojsonnet generate-raw test diff --git a/README.md b/README.md index 663f7831..846239b8 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ set -x set -o pipefail # optional, but we would like to generate yaml, not json -jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} +jsonnet -J vendor -m manifests ${1-example.jsonnet} | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} ``` @@ -145,7 +145,7 @@ A common example is that not all Kubernetes clusters are created exactly the sam kubeadm: -[embedmd]:# (examples/kubeadm.jsonnet) +[embedmd]:# (examples/jsonnet-snippets/kubeadm.jsonnet) ```jsonnet (import "kube-prometheus/kube-prometheus.libsonnet") + (import "kube-prometheus/kube-prometheus-kubeadm.libsonnet") @@ -153,7 +153,7 @@ kubeadm: bootkube: -[embedmd]:# (examples/bootkube.jsonnet) +[embedmd]:# (examples/jsonnet-snippets/bootkube.jsonnet) ```jsonnet (import "kube-prometheus/kube-prometheus.libsonnet") + (import "kube-prometheus/kube-prometheus-bootkube.libsonnet") @@ -161,7 +161,7 @@ bootkube: Another mixin that may be useful for exploring the stack is to expose the UIs of Prometheus, Alertmanager and Grafana on NodePorts: -[embedmd]:# (examples/node-ports.jsonnet) +[embedmd]:# (examples/jsonnet-snippets/node-ports.jsonnet) ```jsonnet (import "kube-prometheus/kube-prometheus.libsonnet") + (import 
"kube-prometheus/kube-prometheus-node-ports.libsonnet") @@ -186,17 +186,21 @@ Standard Kubernetes manifests are all written using [ksonnet-lib](https://github [embedmd]:# (examples/ksonnet-example.jsonnet) ```jsonnet -local k = import "ksonnet/ksonnet.beta.3/k.libsonnet"; +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local daemonset = k.apps.v1beta2.daemonSet; -((import "kube-prometheus/kube-prometheus.libsonnet") + { - nodeExporter+: { - daemonset+: - daemonset.mixin.metadata.withNamespace("my-custom-namespace") - } -}).nodeExporter.daemonset +((import 'kube-prometheus/kube-prometheus.libsonnet') + { + nodeExporter+: { + daemonset+: + daemonset.mixin.metadata.withNamespace('my-custom-namespace'), + }, + }).nodeExporter.daemonset ``` +### Customizing Prometheus alerting/recording rules and Grafana dashboards + +See [developing alerts and dashboards](developing-alerts-and-dashboards.md) guide. + ## Example To use an easy to reproduce example, let's take the minikube setup as demonstrated in [prerequisites](#Prerequisites). 
It is a kubeadm cluster (as we use the kubeadm bootstrapper) and because we would like easy access to our Prometheus, Alertmanager and Grafana UI we want the services to be exposed as NodePort type services: diff --git a/build.sh b/build.sh index 9c22672e..d8af6681 100755 --- a/build.sh +++ b/build.sh @@ -5,5 +5,5 @@ set -x set -o pipefail # optional, but we would like to generate yaml, not json -jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} +jsonnet -J vendor -m manifests ${1-example.jsonnet} | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md deleted file mode 100644 index ed3a2a06..00000000 --- a/docs/developing-alerts-and-dashboards.md +++ /dev/null @@ -1,41 +0,0 @@ -# Developing Prometheus Rules and Grafana Dashboards - -`kube-prometheus` ships with a set of default [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) and [Grafana](http://grafana.com/) dashboards. At some point one might like to extend them, the purpose of this document is to explain how to do this. - -For both the Prometheus rules and the Grafana dashboards there are Kubernetes `ConfigMap`s, that are generated from content in the `assets/` directory. - -The source of truth for the alerts and dashboards are the files in the `assets/` directory. The respective files have to be changed there and then the `make generate` make target is executed to re-generate the Kubernetes manifests. - -Note: `make generate` should be executed from kube-prometheus base directory. - -## Prometheus Rules - -The `ConfigMap` that is generated and holds the Prometheus rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`. - -It is generated from all the `*.rules.yaml` files in the `assets/prometheus/rules/` directory. 
- -To extend the rules simply add a new `.rules.yaml` file into the `assets/prometheus/rules/` directory and re-generate the manifests. To modify the existing rules, simply edit the respective `.rules.yaml` file and re-generate the manifest. - -Then the generated manifest can be applied against a Kubernetes cluster. - -## Dashboards - -The generated `ConfigMap`s holding the Grafana dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`. - -The dashboards themselves get generated from Python scripts: assets/grafana/\*.dashboard.py. -These scripts are loaded by the [grafanalib](https://github.com/aknuds1/grafanalib) -Grafana dashboard generator, which turns them into dashboards. - -Bear in mind that we are for now using a fork of grafanalib as we needed to make extensive -changes to it, in order to be able to generate our dashboards. We are hoping to be able to -consolidate our version with the original. - -After changing grafanalib scripts in assets/grafana, or adding your own, you'll have to run -`make generate` in the kube-prometheus root directory in order to re-generate the dashboards -manifest. You can deploy the latter with kubectl similar to the following: - -``` -kubectl -n monitoring apply -f manifests/grafana/grafana-dashboards.yaml -``` - -This should cause Grafana to re-load its dashboards automatically. diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md new file mode 100644 index 00000000..edd7c656 --- /dev/null +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -0,0 +1,217 @@ +# Developing Prometheus Rules and Grafana Dashboards + +`kube-prometheus` ships with a set of default [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) and [Grafana](http://grafana.com/) dashboards. At some point one might like to extend them, the purpose of this document is to explain how to do this. 
+ +All manifests of kube-prometheus are generated using [jsonnet](https://jsonnet.org/) and Prometheus rules and Grafana dashboards in particular follow the [Prometheus Monitoring Mixins proposal](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/). + +For both the Prometheus rules and the Grafana dashboards Kubernetes `ConfigMap`s are generated within kube-prometheus. In order to add additional rules and dashboards simply merge them onto the existing json objects. This document illustrates examples for rules as well as dashboards. + +All examples in this guide build on the base example of the kube-prometheus [readme](../README.md): + +[embedmd]:# (../example.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + +## Prometheus rules + +### Alerting rules + +According to the [Prometheus Monitoring Mixins proposal](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/) Prometheus alerting rules are under the key `prometheusAlerts` in the top level object, so in order to add an additional alerting rule, we can simply merge an extra rule into the existing object. 
+ +The format is exactly the Prometheus format, so there should be no changes necessary should you have existing rules that you want to include. + +> Note that alerts can just as well be included into this file, using the jsonnet `import` function. In this example it is just inlined in order to demonstrate their use in a single file. + +[embedmd]:# (../examples/prometheus-additional-alert-rule-example.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + prometheusAlerts+:: { + groups+: [ + { + name: 'example-group', + rules: [ + { + alert: 'DeadMansSwitch', + expr: 'vector(1)', + labels: { + severity: 'none', + }, + annotations: { + description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', + }, + }, + ], + }, + ], + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + +### Recording rules + +In order to add a recording rule, simply do the same with the `prometheusRules` field. + +> Note that rules can just as well be included into this file, using the jsonnet `import` function. In this example it is just inlined in order to demonstrate their use in a single file. 
+ +[embedmd]:# (../examples/prometheus-additional-recording-rule-example.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + prometheusRules+:: { + groups+: [ + { + name: 'example-group', + rules: [ + { + record: 'some_recording_rule_name', + expr: 'vector(1)', + }, + ], + }, + ], + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + +### Pre-rendered rules + +We acknowledge that users may need to transition existing rules, and therefore allow an option to add additional pre-rendered rules. This can be done simply by importing the existing rules in the [Prometheus rule format](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) using the jsonnet function `importstr`. In this example we are importing a [provided example rule](../examples/example.rules.yaml). 
+ +[embedmd]:# (../examples/prometheus-additional-rendered-rule-example.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + prometheus+:: { + renderedRules: { + 'example.rules.yaml': (importstr 'example.rules.yaml'), + }, + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + +## Dashboards + +Dashboards can be added either using jsonnet or as a pre-rendered json dashboard. + +### Jsonnet dashboard + +We recommend using the [grafonnet](https://github.com/grafana/grafonnet-lib) library for jsonnet, which gives you a simple DSL to generate Grafana dashboards. Following the [Prometheus Monitoring Mixins proposal](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/) additional dashboards are added to the `grafanaDashboards` key, located in the top level object. To add new jsonnet dashboards, simply add one. + +> Note that dashboards can just as well be included into this file, using the jsonnet `import` function. In this example it is just inlined in order to demonstrate their use in a single file. 
+ +[embedmd]:# (../examples/grafana-additional-jsonnet-dashboard-example.jsonnet) +```jsonnet +local grafana = import 'grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; + +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + grafanaDashboards+:: { + 'my-dashboard.json': + dashboard.new('My Dashboard') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + ) + .addRow( + row.new() + .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') + .addTarget(prometheus.target('vector(1)'))) + ), + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + +### Pre-rendered Grafana dashboards + +As jsonnet is a superset of json, the jsonnet `import` function can be used to include Grafana dashboard json blobs. In this example we are importing a [provided example dashboard](../examples/example-grafana-dashboard.json). 
+ +[embedmd]:# (../examples/grafana-additional-rendered-dashboard-example.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + grafanaDashboards+:: { + 'my-dashboard.json': (import 'example-grafana-dashboard.json'), + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` diff --git a/docs/Monitoring external etcd.md b/docs/monitoring-external-etcd.md similarity index 100% rename from docs/Monitoring external etcd.md rename to docs/monitoring-external-etcd.md diff --git a/examples/example-grafana-dashboard.json b/examples/example-grafana-dashboard.json new file mode 100644 index 00000000..a891040b --- /dev/null +++ b/examples/example-grafana-dashboard.json @@ -0,0 +1,177 @@ +{ + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetid": null, + "graphtooltip": 0, + "hidecontrols": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliascolors": { + + }, + "bars": false, + "dashlength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridpos": { + + }, + "id": 2, + "legend": { + "alignastable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + 
"rightside": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullpointmode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesoverrides": [ + + ], + "spacelength": 10, + "span": 6, + "stack": false, + "steppedline": false, + "targets": [ + { + "expr": "vector(1)", + "format": "time_series", + "intervalfactor": 2, + "legendformat": "", + "refid": "a" + } + ], + "thresholds": [ + + ], + "timefrom": null, + "timeshift": null, + "title": "my panel", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logbase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logbase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatiteration": null, + "repeatrowid": null, + "showtitle": false, + "title": "dashboard row", + "titlesize": "h6", + "type": "row" + } + ], + "schemaversion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "my dashboard", + "version": 0 +} diff --git a/examples/example.rules.yaml b/examples/example.rules.yaml new file mode 100644 index 00000000..94d9d691 --- 
/dev/null +++ b/examples/example.rules.yaml @@ -0,0 +1,9 @@ +groups: +- name: example-group + rules: + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: "none" + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional. diff --git a/examples/grafana-additional-jsonnet-dashboard-example.jsonnet b/examples/grafana-additional-jsonnet-dashboard-example.jsonnet new file mode 100644 index 00000000..578d6a1f --- /dev/null +++ b/examples/grafana-additional-jsonnet-dashboard-example.jsonnet @@ -0,0 +1,45 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; + +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + grafanaDashboards+:: { + 'my-dashboard.json': + dashboard.new('My Dashboard') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + ) + .addRow( + row.new() + .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') + .addTarget(prometheus.target('vector(1)'))) + ), + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in 
std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/examples/grafana-additional-rendered-dashboard-example.jsonnet b/examples/grafana-additional-rendered-dashboard-example.jsonnet new file mode 100644 index 00000000..8aa26bdc --- /dev/null +++ b/examples/grafana-additional-rendered-dashboard-example.jsonnet @@ -0,0 +1,16 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + grafanaDashboards+:: { + 'my-dashboard.json': (import 'example-grafana-dashboard.json'), + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/examples/jsonnet-build-snippet/build-snippet.jsonnet b/examples/jsonnet-build-snippet/build-snippet.jsonnet new file mode 100644 index 00000000..5a11cef6 --- /dev/null +++ b/examples/jsonnet-build-snippet/build-snippet.jsonnet @@ -0,0 +1,7 @@ +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name 
in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/examples/bootkube.jsonnet b/examples/jsonnet-snippets/bootkube.jsonnet similarity index 100% rename from examples/bootkube.jsonnet rename to examples/jsonnet-snippets/bootkube.jsonnet diff --git a/examples/kubeadm.jsonnet b/examples/jsonnet-snippets/kubeadm.jsonnet similarity index 100% rename from examples/kubeadm.jsonnet rename to examples/jsonnet-snippets/kubeadm.jsonnet diff --git a/examples/node-ports.jsonnet b/examples/jsonnet-snippets/node-ports.jsonnet similarity index 100% rename from examples/node-ports.jsonnet rename to examples/jsonnet-snippets/node-ports.jsonnet diff --git a/examples/ksonnet-example.jsonnet b/examples/ksonnet-example.jsonnet index e83ceaf0..565d113f 100644 --- a/examples/ksonnet-example.jsonnet +++ b/examples/ksonnet-example.jsonnet @@ -1,9 +1,9 @@ -local k = import "ksonnet/ksonnet.beta.3/k.libsonnet"; +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local daemonset = k.apps.v1beta2.daemonSet; -((import "kube-prometheus/kube-prometheus.libsonnet") + { - nodeExporter+: { - daemonset+: - daemonset.mixin.metadata.withNamespace("my-custom-namespace") - } -}).nodeExporter.daemonset +((import 'kube-prometheus/kube-prometheus.libsonnet') + { + nodeExporter+: { + daemonset+: + daemonset.mixin.metadata.withNamespace('my-custom-namespace'), + }, + }).nodeExporter.daemonset diff --git a/examples/prometheus-additional-alert-rule-example.jsonnet b/examples/prometheus-additional-alert-rule-example.jsonnet new file mode 100644 index 00000000..b8d16af8 --- /dev/null +++ b/examples/prometheus-additional-alert-rule-example.jsonnet @@ -0,0 +1,32 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + 
{ + _config+:: { + namespace: 'monitoring', + }, + prometheusAlerts+:: { + groups+: [ + { + name: 'example-group', + rules: [ + { + alert: 'DeadMansSwitch', + expr: 'vector(1)', + labels: { + severity: 'none', + }, + annotations: { + description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', + }, + }, + ], + }, + ], + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/examples/prometheus-additional-recording-rule-example.jsonnet b/examples/prometheus-additional-recording-rule-example.jsonnet new file mode 100644 index 00000000..7974e338 --- /dev/null +++ b/examples/prometheus-additional-recording-rule-example.jsonnet @@ -0,0 +1,26 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + prometheusRules+:: { + groups+: [ + { + name: 'example-group', + rules: [ + { + record: 'some_recording_rule_name', + expr: 'vector(1)', + }, + ], + }, + ], + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ 
['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/examples/prometheus-additional-rendered-rule-example.jsonnet b/examples/prometheus-additional-rendered-rule-example.jsonnet new file mode 100644 index 00000000..4ee7317d --- /dev/null +++ b/examples/prometheus-additional-rendered-rule-example.jsonnet @@ -0,0 +1,18 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + prometheus+:: { + renderedRules: { + 'example.rules.yaml': (importstr 'example.rules.yaml'), + }, + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 3b2d415c..d2ae7ae6 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -15,6 +15,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheus+:: { 
replicas: 2, rules: {}, + renderedRules: {}, }, }, @@ -36,7 +37,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; rules: local configMap = k.core.v1.configMap; - configMap.new('prometheus-k8s-rules', { 'all.rules.yaml': std.manifestYamlDoc($._config.prometheus.rules) }) + + configMap.new('prometheus-k8s-rules', ({ 'all.rules.yaml': std.manifestYamlDoc($._config.prometheus.rules) } + $._config.prometheus.renderedRules)) + configMap.mixin.metadata.withLabels({ role: 'alert-rules', prometheus: 'k8s' }) + configMap.mixin.metadata.withNamespace($._config.namespace), roleBindingDefault: diff --git a/test.sh b/test.sh new file mode 100755 index 00000000..dad4e75f --- /dev/null +++ b/test.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -e +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail + + +for i in examples/jsonnet-snippets/*.jsonnet; do + [ -f "$i" ] || break + echo "Testing: ${i}" + echo "" + snippet="local kp = $(<${i}); + +$(<examples/jsonnet-build-snippet/build-snippet.jsonnet)" + echo "${snippet}" > "test.jsonnet" + echo "\`\`\`" + echo "${snippet}" + echo "\`\`\`" + echo "" + jsonnet -J vendor "test.jsonnet" > /dev/null + rm -rf "test.jsonnet" +done + +for i in examples/*.jsonnet; do + [ -f "$i" ] || break + echo "Testing: ${i}" + echo "" + echo "\`\`\`" + echo "$(<${i})" + echo "\`\`\`" + echo "" + jsonnet -J vendor ${i} > /dev/null +done From 9e180452f86aac8a895634422d02f696e65c6d68 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 8 May 2018 09:43:45 +0200 Subject: [PATCH 252/638] prometheus: Introduce RuleFile Custom Resource Definition This patch introduces a new Custom Resource Definition to the Prometheus Operator - the Rule CRD. It addresses two main needs: 1. Prometheus (alerting and recording) Rule validation during creation time via Kubernetes Custom Resource Definition validation. 2. 
Life-cycle management of Prometheus application Rules alongside the application itself, inside the application's Kubernetes namespace, not necessarily the namespace of the scraping Prometheus instance. A user defines Prometheus alerting and recording Rules via a Kubernetes Custom Resource Definition. These Custom Resource Definitions can be fully validated by the Kubernetes API server during creation time via automatically generated OpenAPI specifications. Instead of the restriction of a Prometheus instance to only select Rule definitions inside its own namespace, the Prometheus specification is extended to also specify namespaces to look for Rule Custom Resource Definitions outside its own namespace. --- Dependent technical changes: - prometheus: Use github.com/jimmidyson/configmap-reload to reload rules - prometheus: Remove Prometheus Statefulset deletion function. Starting with K8s >=1.8 this is handled via OwnerReferences. - prometheus: Do not add rule files checksum to Prometheus configuration secret - prometheus: Update StatefulSet only on relevant changes. Instead of updating the Prometheus StatefulSet on every `sync()` run, only update it if the input parameters to `makeStatefulSet` change. Enforce this via a checksum of the parameters which is saved inside the annotations of the statefulset. - e2e/prometheus: Check how often resources (Secret, ConfigMap, Prometheus CRD, Service) are updated to enforce that Prometheus Operator only updates created resources if necessary. - contrib/prometheus-config-reloader: Remove logic to retrieve K8s ConfigMaps. These are mounted into the pod right away now. 
--- Makefile | 5 +- README.md | 4 + build.sh | 4 + .../alertmanager-crd.libsonnet | 2 +- .../prometheus-crd.libsonnet | 2 +- .../rulefile-crd.libsonnet | 1 + ...0alertmanagerCustomResourceDefinition.yaml | 20 ++- ...r-0prometheusCustomResourceDefinition.yaml | 119 +++++++++++++++++- manifests/prometheus-rules.yaml | 5 +- 9 files changed, 150 insertions(+), 12 deletions(-) create mode 100644 jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet diff --git a/Makefile b/Makefile index a7903cf8..b09cdecd 100644 --- a/Makefile +++ b/Makefile @@ -3,14 +3,15 @@ image: generate: image @echo ">> Compiling assets and generating Kubernetes manifests" - docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make crdtojsonnet generate-raw + docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make generate-raw crdtojsonnet: cat ../../example/prometheus-operator-crd/alertmanager.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet cat ../../example/prometheus-operator-crd/prometheus.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet cat ../../example/prometheus-operator-crd/servicemonitor.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet + cat ../../example/prometheus-operator-crd/rulefile.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet -generate-raw: +generate-raw: crdtojsonnet jb install ./build.sh diff --git a/README.md b/README.md index 
663f7831..4655f469 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,10 @@ set -x # only exit with zero if all commands of the pipeline exit successfully set -o pipefail +# Make sure to start with a clean 'manifests' dir +rm -rf manifests +mkdir manifests + # optional, but we would like to generate yaml, not json jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} diff --git a/build.sh b/build.sh index 9c22672e..d78d5d28 100755 --- a/build.sh +++ b/build.sh @@ -4,6 +4,10 @@ set -x # only exit with zero if all commands of the pipeline exit successfully set -o pipefail +# Make sure to start with a clean 'manifests' dir +rm -rf manifests +mkdir manifests + # optional, but we would like to generate yaml, not json jsonnet -J vendor -m manifests example.jsonnet | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} diff --git a/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet index 1970adc7..604a81f6 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet @@ -1 +1 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"alertmanagers.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Alertmanager","plural":"alertmanagers"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Describes an Alertmanager cluster.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"baseImage":{"description":"Base image that is used to deploy pods, without tag.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to an Alertmanager pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. 
The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or its key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. 
Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. 
For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. 
Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. 
Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. Default is false.","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is alpha in 1.8 and can be reworked or removed in a future release.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"externalUrl":{"description":"The external URL the Alertmanager instances will be available under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Alertmanager server listen on loopback, so that it does not bind against the Pod IP. 
Note this is only for the Alertmanager UI, not the gossip communication.","type":"boolean"},"logLevel":{"description":"Log level for Alertmanager to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"If set to true all actions on the underlying managed objects are not going to be performed, except for delete actions.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"replicas":{"description":"Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the running cluster equal to the expected size.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"routePrefix":{"description":"The route prefix Alertmanager registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods. The Secrets are mounted into /etc/alertmanager/secrets/\u003csecret-name\u003e.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. 
Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Alertmanager Pods.","type":"string"},"storage":{"description":"StorageSpec defines the configured storage for a group of Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. 
Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. 
Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version the cluster should be on.","type":"string"}}},"status":{"description":"Most recent observed status of the Alertmanager cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. 
Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"alertmanagers.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Alertmanager","plural":"alertmanagers"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Describes an Alertmanager cluster.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Alertmanager cluster. 
More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. 
If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. 
Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"baseImage":{"description":"Base image that is used to deploy pods, without tag.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to an Alertmanager pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. 
The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or its key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. 
Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. 
For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. 
Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. 
Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"externalUrl":{"description":"The external URL the Alertmanager instances will be available under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Alertmanager server listen on loopback, so that it does not bind against the Pod IP. 
Note this is only for the Alertmanager UI, not the gossip communication.","type":"boolean"},"logLevel":{"description":"Log level for Alertmanager to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"If set to true all actions on the underlying managed objects are not going to be performed, except for delete actions.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"replicas":{"description":"Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the running cluster equal to the expected size.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"routePrefix":{"description":"The route prefix Alertmanager registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods. The Secrets are mounted into /etc/alertmanager/secrets/\u003csecret-name\u003e.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. 
Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. 
Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. 
Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version the cluster should be on.","type":"string"}}},"status":{"description":"Most recent observed status of the Alertmanager cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. 
Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet index d6b13ad9..10e32b6d 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet @@ -1 +1 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheuses.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Prometheus","plural":"prometheuses"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Prometheus defines a Prometheus deployment.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. 
Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"additionalScrapeConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. 
is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"alerting":{"description":"AlertingSpec defines parameters for alerting configuration of Prometheus servers.","properties":{"alertmanagers":{"description":"AlertmanagerEndpoints Prometheus should fire alerts against.","items":{"description":"AlertmanagerEndpoints defines a selection of a single Endpoints object containing alertmanager IPs to fire alerts against.","properties":{"bearerTokenFile":{"description":"BearerTokenFile to read from filesystem to use when authenticating to Alertmanager.","type":"string"},"name":{"description":"Name of Endpoints object in Namespace.","type":"string"},"namespace":{"description":"Namespace of Endpoints object.","type":"string"},"pathPrefix":{"description":"Prefix for the HTTP path alerts are pushed to.","type":"string"},"port":{},"scheme":{"description":"Scheme to use when firing alerts.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the 
targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}},"required":["namespace","name","port"]},"type":"array"}},"required":["alertmanagers"]},"baseImage":{"description":"Base image to use for a Prometheus deployment.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. 
Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a 
key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifer to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). 
Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is alpha in 1.8 and can be reworked or removed in a future release.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"evaluationInterval":{"description":"Interval between consecutive evaluations.","type":"string"},"externalLabels":{"description":"The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager).","type":"object"},"externalUrl":{"description":"The external URL the Prometheus instances will be available under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.","type":"boolean"},"logLevel":{"description":"Log level for Prometheus to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"When a Prometheus deployment is paused, no actions except for deletion will be performed on the underlying objects.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within which each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended on by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and pass them unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"remoteRead":{"description":"If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteReadSpec defines the remote_read configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allows an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"Bearer token for remote read.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote read.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"readRecent":{"description":"Whether reads should be made for queries for time ranges that the local storage should have complete data for.","type":"boolean"},"remoteTimeout":{"description":"Timeout for requests to the remote read endpoint.","type":"string"},"requiredMatchers":{"description":"An optional list of equality matchers which have to be present in a selector to query the remote read endpoint.","type":"object"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to read samples from.","type":"string"}},"required":["url"]},"type":"array"},"remoteWrite":{"description":"If specified, the remote_write spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteWriteSpec defines the remote_write configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allows an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"Bearer token for remote write.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote write.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"remoteTimeout":{"description":"Timeout for requests to the remote write endpoint.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"},"writeRelabelConfigs":{"description":"The list of remote write relabel configurations.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. 
Default is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. Default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"}},"required":["url"]},"type":"array"},"replicas":{"description":"Number of instances to deploy for a Prometheus deployment.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"retention":{"description":"Time duration Prometheus shall retain data for.","type":"string"},"routePrefix":{"description":"The route prefix Prometheus registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"ruleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"scrapeInterval":{"description":"Interval between consecutive scrapes.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. 
The Secrets are mounted into /etc/prometheus/secrets/\u003csecret-name\u003e. Secrets changes after initial creation of a Prometheus object are not reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated with the new list of secrets.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"serviceMonitorNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"serviceMonitorSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. 
If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. 
If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version of Prometheus to be deployed.","type":"string"}}},"status":{"description":"Most recent observed status of the Prometheus cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Prometheus deployment.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Prometheus deployment.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheuses.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Prometheus","plural":"prometheuses"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Prometheus defines a Prometheus 
deployment.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"additionalAlertManagerConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"additionalScrapeConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. 
Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"alerting":{"description":"AlertingSpec defines parameters for alerting configuration of Prometheus servers.","properties":{"alertmanagers":{"description":"AlertmanagerEndpoints Prometheus should fire alerts against.","items":{"description":"AlertmanagerEndpoints defines a selection of a single Endpoints object containing alertmanager IPs to fire alerts against.","properties":{"bearerTokenFile":{"description":"BearerTokenFile to read from filesystem to use when authenticating to Alertmanager.","type":"string"},"name":{"description":"Name of Endpoints object in Namespace.","type":"string"},"namespace":{"description":"Namespace of Endpoints object.","type":"string"},"pathPrefix":{"description":"Prefix for the HTTP path alerts are pushed to.","type":"string"},"port":{},"scheme":{"description":"Scheme to use when firing alerts.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the 
targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}},"required":["namespace","name","port"]},"type":"array"}},"required":["alertmanagers"]},"baseImage":{"description":"Base image to use for a Prometheus deployment.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. 
Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a 
key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). 
Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"evaluationInterval":{"description":"Interval between consecutive evaluations.","type":"string"},"externalLabels":{"description":"The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager).","type":"object"},"externalUrl":{"description":"The external URL the Prometheus instances will be available under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.","type":"boolean"},"logLevel":{"description":"Log level for Prometheus to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"When a Prometheus deployment is paused, no actions except for deletion will be performed on the underlying objects.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within which each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended on by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and pass them unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"remoteRead":{"description":"If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteReadSpec defines the remote_read configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote read.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote read.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"readRecent":{"description":"Whether reads should be made for queries for time ranges that the local storage should have complete data for.","type":"boolean"},"remoteTimeout":{"description":"Timeout for requests to the remote read endpoint.","type":"string"},"requiredMatchers":{"description":"An optional list of equality matchers which have to be present in a selector to query the remote read endpoint.","type":"object"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"}},"required":["url"]},"type":"array"},"remoteWrite":{"description":"If specified, the remote_write spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteWriteSpec defines the remote_write configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"Bearer token for remote write.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote write.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"remoteTimeout":{"description":"Timeout for requests to the remote write endpoint.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"},"writeRelabelConfigs":{"description":"The list of remote write relabel configurations.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. 
Default is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. Default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"}},"required":["url"]},"type":"array"},"replicas":{"description":"Number of instances to deploy for a Prometheus deployment.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"retention":{"description":"Time duration Prometheus shall retain data for.","type":"string"},"routePrefix":{"description":"The route prefix Prometheus registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"ruleFileNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"ruleFileSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"ruleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"scrapeInterval":{"description":"Interval between consecutive scrapes.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. The Secrets are mounted into /etc/prometheus/secrets/\u003csecret-name\u003e. Secrets changes after initial creation of a Prometheus object are not reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated with the new list of secrets.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. 
The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. 
If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"serviceMonitorNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"serviceMonitorSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. 
Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contains details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. 
Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version of Prometheus to be deployed.","type":"string"}}},"status":{"description":"Most recent observed status of the Prometheus cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Prometheus deployment.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. 
Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Prometheus deployment.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet new file mode 100644 index 00000000..cb8d02fc --- /dev/null +++ b/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet @@ -0,0 +1 @@ +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"rulefiles.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"RuleFile","plural":"rulefiles"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"RuleFile defines alerting rules for a Prometheus instance","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. 
In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"RuleFileSpec contains specification parameters for a Rule.","properties":{"groups":{"description":"Content of Prometheus rule file","items":{"description":"RuleGroup is a list of sequentially evaluated recording and alerting rules.","properties":{"interval":{"format":"int64","type":"integer"},"name":{"type":"string"},"rules":{"items":{"description":"Rule describes an alerting or recording rule.","properties":{"alert":{"type":"string"},"annotations":{"type":"object"},"expr":{"type":"string"},"for":{"format":"int64","type":"integer"},"labels":{"type":"object"},"record":{"type":"string"}},"required":["expr"]},"type":"array"}},"required":["name","rules"]},"type":"array"}}}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index b0976073..081e1d4d 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -666,7 +666,7 @@ spec: description: Specify whether the ConfigMap must be defined type: boolean prefix: - description: An optional identifer to prepend to each key + description: An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER. type: string secretRef: @@ -1120,6 +1120,14 @@ spec: description: Whether this container has a read-only root filesystem. Default is false. type: boolean + runAsGroup: + description: The GID to run the entrypoint of the container + process. Uses runtime default if unset. May also be set + in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext + takes precedence. 
+ format: int64 + type: integer runAsNonRoot: description: Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime @@ -1231,8 +1239,7 @@ spec: description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer - is used. This field is alpha in 1.8 and can be reworked - or removed in a future release. + is used. This field is beta in 1.10. type: string name: description: This must match the Name of a Volume. @@ -1614,6 +1621,13 @@ spec: If unset, the Kubelet will not modify the ownership and permissions of any volume. format: int64 type: integer + runAsGroup: + description: The GID to run the entrypoint of the container process. + Uses runtime default if unset. May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, the value + specified in SecurityContext takes precedence for that container. + format: int64 + type: integer runAsNonRoot: description: Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index e30f5bb2..8cc1a59c 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -27,6 +27,21 @@ spec: description: 'Specification of the desired behavior of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: + additionalAlertManagerConfigs: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be a valid + secret key. 
+ type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be defined + type: boolean + required: + - key additionalScrapeConfigs: description: SecretKeySelector selects a key of a Secret. properties: @@ -734,7 +749,7 @@ spec: description: Specify whether the ConfigMap must be defined type: boolean prefix: - description: An optional identifer to prepend to each key + description: An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER. type: string secretRef: @@ -1188,6 +1203,14 @@ spec: description: Whether this container has a read-only root filesystem. Default is false. type: boolean + runAsGroup: + description: The GID to run the entrypoint of the container + process. Uses runtime default if unset. May also be set + in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext + takes precedence. + format: int64 + type: integer runAsNonRoot: description: Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime @@ -1299,8 +1322,7 @@ spec: description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer - is used. This field is alpha in 1.8 and can be reworked - or removed in a future release. + is used. This field is beta in 1.10. type: string name: description: This must match the Name of a Volume. @@ -1877,6 +1899,90 @@ spec: the server serves requests under a different route prefix. For example for use with `kubectl proxy`. type: string + ruleFileNamespaceSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. 
A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + ruleFileSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. 
Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object ruleSelector: description: A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty @@ -1948,6 +2054,13 @@ spec: If unset, the Kubelet will not modify the ownership and permissions of any volume. format: int64 type: integer + runAsGroup: + description: The GID to run the entrypoint of the container process. + Uses runtime default if unset. May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, the value + specified in SecurityContext takes precedence for that container. + format: int64 + type: integer runAsNonRoot: description: Indicates that the container must run as a non-root user. 
If true, the Kubelet will validate the image at runtime diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index a18275b6..7354a6e9 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -16,8 +16,9 @@ data: \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"}) by (namespace, pod)\n * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_memory_bytes:sum\"\n - \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"}) - by (namespace, pod)\n * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, + \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"} + and on(pod) kube_pod_status_scheduled{condition=\"true\"}) by (namespace, pod)\n + \ * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_cpu_cores:sum\"\n- \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info) by (node))\"\n \"record\": \":kube_pod_info_node_count:\"\n - \"expr\": |\n From b7949ee5204f4ee3fd5057c8e3f21e807a16b874 Mon Sep 17 00:00:00 2001 From: hsinhoyeh Date: Sun, 20 May 2018 10:26:02 +0800 Subject: [PATCH 253/638] fixed dashboard example json path --- docs/developing-prometheus-rules-and-grafana-dashboards.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index 
edd7c656..9f1166ce 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -194,7 +194,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { ### Pre-rendered Grafana dashboards -As jsonnet is a superset of json, the jsonnet `import` function can be used to include Grafana dashboard json blobs. In this example we are importing a [provided example dashboard](examples/example-grafana-dashboard.json). +As jsonnet is a superset of json, the jsonnet `import` function can be used to include Grafana dashboard json blobs. In this example we are importing a [provided example dashboard](../examples/example-grafana-dashboard.json). [embedmd]:# (../examples/grafana-additional-rendered-dashboard-example.jsonnet) ```jsonnet From 2408e243b2188618686718015c0e9957c174d4b6 Mon Sep 17 00:00:00 2001 From: hsinhoyeh Date: Sun, 20 May 2018 10:53:31 +0800 Subject: [PATCH 254/638] fix again --- README.md | 2 +- docs/monitoring-external-etcd.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 047a4c97..3230567d 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ local daemonset = k.apps.v1beta2.daemonSet; ### Customizing Prometheus alerting/recording rules and Grafana dashboards -See [developing alerts and dashboards](developing-alerts-and-dashboards.md) guide. +See [developing alerts and dashboards](developing-prometheus-rules-and-grafana-dashboards.md) guide. ## Example diff --git a/docs/monitoring-external-etcd.md b/docs/monitoring-external-etcd.md index f46f4010..bfdc58a3 100644 --- a/docs/monitoring-external-etcd.md +++ b/docs/monitoring-external-etcd.md @@ -153,10 +153,10 @@ https://grafana.com/dashboards/3070 ## Save the dashboard in the configmap -As documented here, [Developing Alerts and Dashboards](developing-alerts-and-dashboards.md), the Grafana instances are stateless. 
The dashboards are automatically re-loaded from the ConfigMap. +As documented here, [Developing Alerts and Dashboards](developing-prometheus-rules-and-grafana-dashboards.md), the Grafana instances are stateless. The dashboards are automatically re-loaded from the ConfigMap. So if you load a dashboard through the Grafana UI, it won't be kept unless saved in ConfigMap -Read [the document](developing-alerts-and-dashboards.md), but in summary: +Read [the document](developing-prometheus-rules-and-grafana-dashboards.md), but in summary: ### Copy your dashboard: Once you are happy with the dashboard, export it and move it to `prometheus-operator/contrib/kube-prometheus/assets/grafana/` (ending in "-dashboard.json") From 3bd7d36abd2a2902522e815a2606ccc885c0ab5b Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Mon, 21 May 2018 16:18:57 -0500 Subject: [PATCH 255/638] add the watch verb (to the namespace apiGroup for the prometheus-operator ClusterRole). To get the "Failed to watch" error to stop happening in the prometheus-operator logs. Fixes #1324 for kube-prometheus. 
--- .../prometheus-operator/prometheus-operator.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index c1003ebf..32864026 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -100,7 +100,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; policyRule.withResources([ 'namespaces', ]) + - policyRule.withVerbs(['list']); + policyRule.withVerbs(['list', 'watch']); local rules = [extensionsRule, apiExtensionsRule, monitoringRule, appsRule, coreRule, podRule, routingRule, nodeRule, namespaceRule]; From 2b81c0090b1db097ba6e0fd7fe035443dae9d7a2 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Thu, 24 May 2018 16:35:19 -0500 Subject: [PATCH 256/638] After modifying prometheus-operator.libsonnet (see commit ec19c16), these are the changes that were made to 0prometheus-operator-clusterRole.yaml by running "make generate". --- manifests/0prometheus-operator-clusterRole.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index 76f943df..8c85391f 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -67,3 +67,4 @@ rules: - namespaces verbs: - list + - watch From 3605c0eb44be437fcfd5737ae74b4ad0d766d7b3 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Thu, 24 May 2018 16:53:32 -0500 Subject: [PATCH 257/638] Also checking in the changes made to prometheus-rules.yaml by "make generate", in order to (hopefully) get the build to be green. 
--- manifests/prometheus-rules.yaml | 122 +++++++++++++++++++++++++++++--- 1 file changed, 112 insertions(+), 10 deletions(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 7354a6e9..d6e5d124 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -20,6 +20,44 @@ data: and on(pod) kube_pod_status_scheduled{condition=\"true\"}) by (namespace, pod)\n \ * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_cpu_cores:sum\"\n- + \"name\": \"kube-scheduler.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, + sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n + \ \"record\": \"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n + \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n + \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n + \ \"record\": \"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.9, 
sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n + \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n + \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n + \ \"record\": \"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n + \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n + \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n- + \"name\": \"kube-apiserver.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, + sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) without(instance, + pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n \"record\": + \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \"expr\": + |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n + \ \"record\": 
\"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n + \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) + without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n + \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n- \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info) by (node))\"\n \"record\": \":kube_pod_info_node_count:\"\n - \"expr\": |\n \ max(label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\", @@ -84,7 +122,21 @@ data: by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]) +\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-apps\"\n + \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-absent\"\n + \ \"rules\": \n - \"alert\": \"KubeAPIDown\"\n \"annotations\": \n \"message\": + \"KubeAPI has disappeared from Prometheus target discovery.\"\n \"expr\": |\n + \ absent(up{job=\"kube-apiserver\"} == 1)\n \"for\": \"15m\"\n \"labels\": + \n \"severity\": \"critical\"\n - \"alert\": \"KubeControllerManagerDown\"\n + \ \"annotations\": \n \"message\": \"KubeControllerManager has disappeared + from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-controller-manager\"} + == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"KubeSchedulerDown\"\n \"annotations\": \n \"message\": + \"KubeScheduler has disappeared from Prometheus target discovery.\"\n \"expr\": + |\n absent(up{job=\"kube-scheduler\"} == 1)\n \"for\": \"15m\"\n \"labels\": + \n \"severity\": \"critical\"\n - \"alert\": \"KubeletDown\"\n \"annotations\": + \n \"message\": \"Kubelet has disappeared from Prometheus 
target discovery.\"\n + \ \"expr\": |\n absent(up{job=\"kubelet\"} == 1)\n \"for\": \"15m\"\n + \ \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"kubernetes-apps\"\n \ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n \ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \\\"%.2f\\\" $value }} / second\"\n \"expr\": |\n @@ -103,8 +155,32 @@ data: \"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch\"\n \ \"expr\": |\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n \ !=\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": - \"kubernetes-resources\"\n \"rules\": \n - \"alert\": \"KubeCPUOvercommit\"\n + \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"KubeStatefulSetReplicasMismatch\"\n \"annotations\": \n \"message\": + \"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica mismatch\"\n + \ \"expr\": |\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n + \ !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n + \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"KubeStatefulSetGenerationMismatch\"\n \"annotations\": \n \"message\": + \"StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation mismatch\"\n + \ \"expr\": |\n kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n + \ !=\n kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n + \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"KubeDaemonSetRolloutStuck\"\n \"annotations\": \n \"message\": + \"Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespace}}/{{$labels.daemonset}}\"\n + \ \"expr\": |\n kube_daemonset_status_number_ready{job=\"kube-state-metrics\"}\n + \ 
/\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} + * 100 < 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"KubeDaemonSetNotScheduled\"\n \"annotations\": \n \"message\": + \"A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are + not scheduled.\"\n \"expr\": |\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n + \ -\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} + > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"KubeDaemonSetMisScheduled\"\n \"annotations\": \n \"message\": + \"A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are + running where they are not supposed to run.\"\n \"expr\": |\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} + > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n- + \"name\": \"kubernetes-resources\"\n \"rules\": \n - \"alert\": \"KubeCPUOvercommit\"\n \ \"annotations\": \n \"message\": \"Overcommited CPU resource requests on Pods, cannot tolerate node failure.\"\n \"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)\n \ /\n sum(node:node_num_cpu:sum)\n >\n (count(node:node_num_cpu:sum)-1) @@ -142,12 +218,12 @@ data: 4 * 24 * 3600) < 0\n \"for\": \"5m\"\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"kubernetes-system\"\n \"rules\": \n - \"alert\": \"KubeNodeNotReady\"\n \"annotations\": \n \"message\": \"{{ $labels.node - }} has been unready for more than an hour\"\n \"expr\": |\n max(kube_node_status_ready{job=\"kube-state-metrics\", - condition=\"false\"} == 1) BY (node)\n \"for\": \"1h\"\n \"labels\": \n - \ \"severity\": \"warning\"\n - \"alert\": \"KubeVersionMismatch\"\n \"annotations\": - \n \"message\": \"There are {{ $value }} different versions of Kubernetes - components running.\"\n \"expr\": |\n 
count(count(kubernetes_build_info{job!=\"kube-dns\"}) - by (gitVersion)) > 1\n \"for\": \"1h\"\n \"labels\": \n \"severity\": + }} has been unready for more than an hour\"\n \"expr\": |\n kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} + == 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"KubeVersionMismatch\"\n \"annotations\": \n \"message\": + \"There are {{ $value }} different versions of Kubernetes components running.\"\n + \ \"expr\": |\n count(count(kubernetes_build_info{job!=\"kube-dns\"}) by + (gitVersion)) > 1\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientErrors\"\n \"annotations\": \n \"message\": \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \\\"%0.0f\\\" $value }}% errors.'\"\n \"expr\": |\n sum(rate(rest_client_requests_total{code!~\"2..\"}[5m])) @@ -157,7 +233,33 @@ data: \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \\\"%0.0f\\\" $value }} errors / sec.'\"\n \"expr\": |\n sum(rate(ksm_scrape_error_total{job=\"kube-state-metrics\"}[5m])) by (instance, job) > 0.1\n \"for\": \"15m\"\n \"labels\": \n \"severity\": - \"warning\"" + \"warning\"\n - \"alert\": \"KubeletTooManyPods\"\n \"annotations\": \n \"message\": + \"Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit + of 110.\"\n \"expr\": |\n kubelet_running_pod_count{job=\"kubelet\"} > + 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": + \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} + {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} + > 1\n \"for\": 
\"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": + \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} + {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} + > 4\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": + \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) + without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) + without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": + \"critical\"\n - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": + \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) + without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) + without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": + \"warning\"\n - \"alert\": \"KubeCertificateExpiration\"\n \"annotations\": + \n \"message\": \"Kubernetes API certificate is expiring in less than 7 days.\"\n + \ \"expr\": |\n sum(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\",le=\"604800\"}) + > 0\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeCertificateExpiration\"\n + \ \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring + in less than 1 day.\"\n \"expr\": |\n sum(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\",le=\"86400\"}) + > 0\n \"labels\": \n \"severity\": \"warning\"" kind: ConfigMap 
metadata: labels: From f1553a76b7f783a96670d5265181c145055f5d19 Mon Sep 17 00:00:00 2001 From: alpha Date: Fri, 25 May 2018 14:41:28 +0545 Subject: [PATCH 258/638] fix doc guide link of kube-prometheus --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3230567d..e5e1ddbc 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ local daemonset = k.apps.v1beta2.daemonSet; ### Customizing Prometheus alerting/recording rules and Grafana dashboards -See [developing alerts and dashboards](developing-prometheus-rules-and-grafana-dashboards.md) guide. +See [developing alerts and dashboards](docs/developing-prometheus-rules-and-grafana-dashboards.md) guide. ## Example From d8f6b6f81b9bbc8d9284af45a1d0f7d93dbf7ce1 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 18 May 2018 11:52:27 +0200 Subject: [PATCH 259/638] kube-prometheus: Update kops docs --- README.md | 8 ++++++ docs/KOPSonAWS.md | 20 --------------- examples/jsonnet-snippets/kops.jsonnet | 2 ++ ...kube-prometheus-insecure-kubelet.libsonnet | 25 +++++++++++++++++++ .../kube-prometheus-kops.libsonnet | 23 +++++++++++++++++ 5 files changed, 58 insertions(+), 20 deletions(-) delete mode 100644 docs/KOPSonAWS.md create mode 100644 examples/jsonnet-snippets/kops.jsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet diff --git a/README.md b/README.md index e5e1ddbc..d67ea680 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,14 @@ bootkube: (import "kube-prometheus/kube-prometheus-bootkube.libsonnet") ``` +kops: + +[embedmd]:# (examples/jsonnet-snippets/kops.jsonnet) +```jsonnet +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kops.libsonnet') +``` + Another mixin that may be useful for exploring the stack is to expose the UIs of Prometheus, Alertmanager and Grafana on NodePorts: 
[embedmd]:# (examples/jsonnet-snippets/node-ports.jsonnet) diff --git a/docs/KOPSonAWS.md b/docs/KOPSonAWS.md deleted file mode 100644 index 26080a9f..00000000 --- a/docs/KOPSonAWS.md +++ /dev/null @@ -1,20 +0,0 @@ -# Adding kube-prometheus to [KOPS](https://github.com/kubernetes/kops) on AWS 1.5.x - - -## Prerequisites - -A running Kubernetes cluster created with [KOPS](https://github.com/kubernetes/kops). - -These instructions have currently been tested with **topology=public** on AWS with KOPS 1.7.1 and Kubernetes 1.7.x - -Following the instructions in the [README](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/README.md): - -Example: - -```bash -git clone -b master https://github.com/coreos/prometheus-operator.git prometheus-operator-temp; -cd prometheus-operator-temp/contrib/kube-prometheus -./hack/cluster-monitoring/self-hosted-deploy -cd - -rm -rf prometheus-operator-temp -``` diff --git a/examples/jsonnet-snippets/kops.jsonnet b/examples/jsonnet-snippets/kops.jsonnet new file mode 100644 index 00000000..4ff9ceae --- /dev/null +++ b/examples/jsonnet-snippets/kops.jsonnet @@ -0,0 +1,2 @@ +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kops.libsonnet') diff --git a/jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet new file mode 100644 index 00000000..1bd64e1b --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet @@ -0,0 +1,25 @@ +{ + prometheus+:: { + serviceMonitorKubelet+: + { + spec+: { + endpoints: [ + { + port: 'http-metrics', + scheme: 'http', + interval: '30s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + { + port: 'http-metrics', + scheme: 'http', + path: '/metrics/cadvisor', + interval: '30s', + honorLabels: true, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + ], + }, + }, + }, +} 
diff --git a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet new file mode 100644 index 00000000..a9cf3bb3 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet @@ -0,0 +1,23 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+:: { + kubeControllerManagerPrometheusDiscoveryService: + service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) + + service.mixin.spec.withClusterIp('None'), + kubeSchedulerPrometheusDiscoveryService: + service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + + service.mixin.spec.withClusterIp('None'), + kubeDnsPrometheusDiscoveryService: + service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('http-metrics-skydns', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) + + service.mixin.spec.withClusterIp('None'), + }, +} From ca057722e501f93ce9e8b36561d8fb8ef020fd4c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 18 May 2018 12:04:33 +0200 Subject: [PATCH 260/638] Add formatting jsonnet to generate make target --- Makefile | 10 +++++-- README.md | 30 ++++++++++---------- examples/jsonnet-snippets/bootkube.jsonnet | 4 +-- examples/jsonnet-snippets/kubeadm.jsonnet | 4 +-- 
examples/jsonnet-snippets/node-ports.jsonnet | 4 +-- examples/prometheus-name-override.jsonnet | 18 ++++++------ 6 files changed, 38 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index baae1e20..60c6d5f3 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,5 @@ +JSONNET_FMT := jsonnet fmt -n 2 --max-blank-lines 2 --string-style s --comment-style s + image: docker build -f ../../scripts/jsonnet/Dockerfile -t po-jsonnet ../../ @@ -11,10 +13,14 @@ crdtojsonnet: cat ../../example/prometheus-operator-crd/servicemonitor.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet cat ../../example/prometheus-operator-crd/rulefile.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet -generate-raw: crdtojsonnet +generate-raw: crdtojsonnet fmt jb install ./build.sh +fmt: + find . -name 'vendor' -prune -o -name '*.libsonnet' -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNET_FMT) -i + test: image @echo ">> Compiling assets and generating Kubernetes manifests" docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make test-raw @@ -23,4 +29,4 @@ test-raw: crdtojsonnet jb install ./test.sh -.PHONY: image generate crdtojsonnet generate-raw test +.PHONY: image generate crdtojsonnet generate-raw test test-raw fmt diff --git a/README.md b/README.md index d67ea680..cc392e7a 100644 --- a/README.md +++ b/README.md @@ -151,16 +151,16 @@ kubeadm: [embedmd]:# (examples/jsonnet-snippets/kubeadm.jsonnet) ```jsonnet -(import "kube-prometheus/kube-prometheus.libsonnet") + -(import "kube-prometheus/kube-prometheus-kubeadm.libsonnet") +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') ``` bootkube: [embedmd]:# 
(examples/jsonnet-snippets/bootkube.jsonnet) ```jsonnet -(import "kube-prometheus/kube-prometheus.libsonnet") + -(import "kube-prometheus/kube-prometheus-bootkube.libsonnet") +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-bootkube.libsonnet') ``` kops: @@ -175,23 +175,23 @@ Another mixin that may be useful for exploring the stack is to expose the UIs of [embedmd]:# (examples/jsonnet-snippets/node-ports.jsonnet) ```jsonnet -(import "kube-prometheus/kube-prometheus.libsonnet") + -(import "kube-prometheus/kube-prometheus-node-ports.libsonnet") +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') ``` For example the name of the `Prometheus` object provided by this library can be overridden: [embedmd]:# (examples/prometheus-name-override.jsonnet) ```jsonnet -((import "kube-prometheus/kube-prometheus.libsonnet") + { - prometheus+: { - prometheus+: { - metadata+: { - name: "my-name", - } - } - } -}).prometheus.prometheus +((import 'kube-prometheus/kube-prometheus.libsonnet') + { + prometheus+: { + prometheus+: { + metadata+: { + name: 'my-name', + }, + }, + }, + }).prometheus.prometheus ``` Standard Kubernetes manifests are all written using [ksonnet-lib](https://github.com/ksonnet/ksonnet-lib/), so they can be modified with the mixins supplied by ksonnet-lib. 
For example to override the namespace of the node-exporter DaemonSet: diff --git a/examples/jsonnet-snippets/bootkube.jsonnet b/examples/jsonnet-snippets/bootkube.jsonnet index 89a7eb7b..f7386a01 100644 --- a/examples/jsonnet-snippets/bootkube.jsonnet +++ b/examples/jsonnet-snippets/bootkube.jsonnet @@ -1,2 +1,2 @@ -(import "kube-prometheus/kube-prometheus.libsonnet") + -(import "kube-prometheus/kube-prometheus-bootkube.libsonnet") +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-bootkube.libsonnet') diff --git a/examples/jsonnet-snippets/kubeadm.jsonnet b/examples/jsonnet-snippets/kubeadm.jsonnet index 591809eb..a7837163 100644 --- a/examples/jsonnet-snippets/kubeadm.jsonnet +++ b/examples/jsonnet-snippets/kubeadm.jsonnet @@ -1,2 +1,2 @@ -(import "kube-prometheus/kube-prometheus.libsonnet") + -(import "kube-prometheus/kube-prometheus-kubeadm.libsonnet") +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') diff --git a/examples/jsonnet-snippets/node-ports.jsonnet b/examples/jsonnet-snippets/node-ports.jsonnet index 68731676..c02f1ae7 100644 --- a/examples/jsonnet-snippets/node-ports.jsonnet +++ b/examples/jsonnet-snippets/node-ports.jsonnet @@ -1,2 +1,2 @@ -(import "kube-prometheus/kube-prometheus.libsonnet") + -(import "kube-prometheus/kube-prometheus-node-ports.libsonnet") +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') diff --git a/examples/prometheus-name-override.jsonnet b/examples/prometheus-name-override.jsonnet index d6410fd8..86218012 100644 --- a/examples/prometheus-name-override.jsonnet +++ b/examples/prometheus-name-override.jsonnet @@ -1,9 +1,9 @@ -((import "kube-prometheus/kube-prometheus.libsonnet") + { - prometheus+: { - prometheus+: { - metadata+: { - name: "my-name", - } - } - } -}).prometheus.prometheus +((import 'kube-prometheus/kube-prometheus.libsonnet') + { + 
prometheus+: { + prometheus+: { + metadata+: { + name: 'my-name', + }, + }, + }, + }).prometheus.prometheus From 3a7628e8423aa2fa20b433b886f71cbbdd213545 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 18 May 2018 12:20:17 +0200 Subject: [PATCH 261/638] Fix link to Prometheus rules & Grafana dashboards guide --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cc392e7a..6b8cb264 100644 --- a/README.md +++ b/README.md @@ -211,7 +211,7 @@ local daemonset = k.apps.v1beta2.daemonSet; ### Customizing Prometheus alerting/recording rules and Grafana dashboards -See [developing alerts and dashboards](docs/developing-prometheus-rules-and-grafana-dashboards.md) guide. +See [developing Prometheus rules and Grafana dashboards](docs/developing-prometheus-rules-and-grafana-dashboards.md) guide. ## Example From ef4673705ee2978e956a7061edca53f70dc2dfaf Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 25 May 2018 11:34:01 +0200 Subject: [PATCH 262/638] kube-prometheus/docs: Add guide on exposing via Ingress --- .gitignore | 1 + ...prometheus-alertmanager-grafana-ingress.md | 91 +++++++++++++++++++ examples/auth | 2 + examples/ingress.jsonnet | 45 +++++++++ 4 files changed, 139 insertions(+) create mode 100644 docs/exposing-prometheus-alertmanager-grafana-ingress.md create mode 100644 examples/auth create mode 100644 examples/ingress.jsonnet diff --git a/.gitignore b/.gitignore index 133fdf90..dc2549f2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ tmp/ minikube-manifests/ jsonnetfile.lock.json vendor/ +./auth diff --git a/docs/exposing-prometheus-alertmanager-grafana-ingress.md b/docs/exposing-prometheus-alertmanager-grafana-ingress.md new file mode 100644 index 00000000..34213067 --- /dev/null +++ b/docs/exposing-prometheus-alertmanager-grafana-ingress.md @@ -0,0 +1,91 @@ +# Exposing Prometheus, Alertmanager and Grafana UIs via Ingress + +In order to access the web interfaces via the Internet 
[Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) is a popular option. This guide explains how Kubernetes Ingress can be set up in order to expose the Prometheus, Alertmanager and Grafana UIs that are included in the [kube-prometheus](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) project. + +Note: before continuing, it is recommended to first get familiar with the [kube-prometheus](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) stack by itself. + +## Prerequisites + +Apart from a running Kubernetes cluster with a running [kube-prometheus](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) stack, a Kubernetes Ingress controller must be installed and functional. This guide was tested with the [nginx-ingress-controller](https://github.com/kubernetes/ingress-nginx). If you wish to reproduce the exact result as depicted in this guide, we recommend using the nginx-ingress-controller. + +## Setting up Ingress + +The setup of Ingress objects is the same for Prometheus, Alertmanager and Grafana. Therefore this guide demonstrates it in detail for Prometheus, as it can easily be adapted for the other applications. + +As monitoring data may contain sensitive data, this guide describes how to set up Ingress with basic auth as an example of minimal security. Of course this should be adapted to the preferred authentication means of any particular organization, but we feel it is important to at least provide an example with a minimum of security. + +In order to set up basic auth, a secret with the `htpasswd` formatted file needs to be created. To do this, first install the [`htpasswd`](https://httpd.apache.org/docs/2.4/programs/htpasswd.html) tool. 
+ +To create the `htpasswd` formatted file called `auth` run: + +``` +htpasswd -c auth <username> +``` + +In order to use this, a secret needs to be created containing the contents of the `htpasswd` file; basic auth can then be configured with annotations on the Ingress object. + +[embedmd]:# (../examples/ingress.jsonnet) +```jsonnet +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local secret = k.core.v1.secret; +local ingress = k.extensions.v1beta1.ingress; +local ingressTls = ingress.mixin.spec.tlsType; +local ingressRule = ingress.mixin.spec.rulesType; +local httpIngressPath = ingressRule.mixin.http.pathsType; + +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + { + _config+:: { + namespace: 'monitoring', + }, + ingress+:: { + 'prometheus-k8s': + ingress.new() + + ingress.mixin.metadata.withName('prometheus-k8s') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'nginx.ingress.kubernetes.io/auth-type': 'basic', + 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', + 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', + }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost('prometheus.example.com') + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.mixin.backend.withServiceName('prometheus-k8s') + + httpIngressPath.mixin.backend.withServicePort('web') + ), + ), + }, + } + { + ingress+:: { + 'basic-auth-secret': + secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) + + secret.mixin.metadata.withNamespace($._config.namespace), + }, + }; + +k.core.v1.list.new([ + kp.ingress['prometheus-k8s'], + kp.ingress['basic-auth-secret'], +]) +``` + +In order to expose Alertmanager and Grafana, simply create additional fields containing an ingress object, but pointing at the `alertmanager` or `grafana` Service instead of the `prometheus-k8s` Service. 
Make sure to also use the correct port for each: for Alertmanager it is also `web`, for Grafana it is `http`. + +In order to render the ingress objects similarly to the other objects, use the following, as demonstrated in the [main readme](../README.md#usage): + +``` +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['ingress-' + name]: kp.ingress[name] for name in std.objectFields(kp.ingress) } +``` + +Note that, in comparison, only the last line was added; the rest is identical to the original. diff --git a/examples/auth b/examples/auth new file mode 100644 index 00000000..95692021 --- /dev/null +++ b/examples/auth @@ -0,0 +1,2 @@ +# This file should not ever be used, it's just a mock. +dontusethis:$apr1$heg6VIp7$1PSzJ/Z6fYboQ5pYrbgSy. 
diff --git a/examples/ingress.jsonnet b/examples/ingress.jsonnet new file mode 100644 index 00000000..149fea9f --- /dev/null +++ b/examples/ingress.jsonnet @@ -0,0 +1,45 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local secret = k.core.v1.secret; +local ingress = k.extensions.v1beta1.ingress; +local ingressTls = ingress.mixin.spec.tlsType; +local ingressRule = ingress.mixin.spec.rulesType; +local httpIngressPath = ingressRule.mixin.http.pathsType; + +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + { + _config+:: { + namespace: 'monitoring', + }, + ingress+:: { + 'prometheus-k8s': + ingress.new() + + ingress.mixin.metadata.withName('prometheus-k8s') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'nginx.ingress.kubernetes.io/auth-type': 'basic', + 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', + 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', + }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost('prometheus.example.com') + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.mixin.backend.withServiceName('prometheus-k8s') + + httpIngressPath.mixin.backend.withServicePort('web') + ), + ), + }, + } + { + ingress+:: { + 'basic-auth-secret': + secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) + + secret.mixin.metadata.withNamespace($._config.namespace), + }, + }; + +k.core.v1.list.new([ + kp.ingress['prometheus-k8s'], + kp.ingress['basic-auth-secret'], +]) From 1969fada7e75cc6c53b7111e385894d71273392a Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 25 May 2018 16:49:19 +0200 Subject: [PATCH 263/638] kube-prometheus: Add table of contents --- README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b8cb264..9256319c 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,19 @@ Components included 
in this package: This stack is meant for cluster monitoring, so it is pre-configured to collect metrics from all Kubernetes components. In addition to that it delivers a default set of dashboards and alerting rules. Many of the useful dashboards and alerts come from the [kubernetes-mixin project](https://github.com/kubernetes-monitoring/kubernetes-mixin), similar to this project it provides composable jsonnet as a library for users to customize to their needs. +## Table of contents + +* [Prerequisites](#prerequisites) + * [minikube](#minikube) +* [Quickstart](#quickstart) +* [Usage](#usage) + * [Compiling](#compiling) +* [Configuration](#configuration) +* [Customization](#customization) + * [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) + * [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) +* [Minikube Example](#minikube-example) + ## Prerequisites You will need a Kubernetes cluster, that's it! By default it is assumed, that the kubelet uses token authN and authZ, as otherwise Prometheus needs a client certificate, which gives it full access to the kubelet, rather than just the metrics. Token authN and authZ allows more fine grained and easier access control. @@ -213,7 +226,11 @@ local daemonset = k.apps.v1beta2.daemonSet; See [developing Prometheus rules and Grafana dashboards](docs/developing-prometheus-rules-and-grafana-dashboards.md) guide. -## Example +### Exposing Prometheus/Alermanager/Grafana via Ingress + +See [exposing Prometheus/Alertmanager/Grafana](docs/exposing-prometheus-alertmanager-grafana-ingress.md) guide. + +## Minikube Example To use an easy to reproduce example, let's take the minikube setup as demonstrated in [prerequisites](#Prerequisites). 
It is a kubeadm cluster (as we use the kubeadm bootstrapper) and because we would like easy access to our Prometheus, Alertmanager and Grafana UI we want the services to be exposed as NodePort type services: From c11db468631fdac91b475601bc796ccf0eb03eaa Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 25 May 2018 17:01:41 +0200 Subject: [PATCH 264/638] kube-prometheus: Re-generate --- manifests/prometheus-rules.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index d6e5d124..35aaa927 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -253,13 +253,14 @@ data: \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeCertificateExpiration\"\n \"annotations\": + \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring in less than 7 days.\"\n - \ \"expr\": |\n sum(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\",le=\"604800\"}) - > 0\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeCertificateExpiration\"\n + \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) + < 604800\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \ \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring - in less than 1 day.\"\n \"expr\": |\n sum(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\",le=\"86400\"}) - > 0\n \"labels\": \n 
\"severity\": \"warning\"" + in less than 1 day.\"\n \"expr\": |\n histogram_quantile(0.01, sum by + (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) + < 86400\n \"labels\": \n \"severity\": \"critical\"" kind: ConfigMap metadata: labels: From da6fc256b56dc5b2074e9aea973c2635fb84b021 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 25 May 2018 18:02:23 +0200 Subject: [PATCH 265/638] Make the prometheus & alertmanager name configurable --- .../alertmanager/alertmanager.libsonnet | 17 +++--- .../prometheus/prometheus.libsonnet | 59 ++++++++++--------- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index f4634703..db370e1d 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -15,6 +15,7 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by }, alertmanager+:: { + name: $._config.alertmanager.name, config: alertmanagerConfig, replicas: 3, }, @@ -24,13 +25,13 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by secret: local secret = k.core.v1.secret; - secret.new('alertmanager-main', { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) + + secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) + secret.mixin.metadata.withNamespace($._config.namespace), serviceAccount: local serviceAccount = k.core.v1.serviceAccount; - serviceAccount.new('alertmanager-main') + + serviceAccount.new('alertmanager-' + $._config.alertmanager.name) + serviceAccount.mixin.metadata.withNamespace($._config.namespace), service: @@ -39,9 +40,9 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by local alertmanagerPort = 
servicePort.newNamed('web', 9093, 'web'); - service.new('alertmanager-main', { app: 'alertmanager', alertmanager: 'main' }, alertmanagerPort) + + service.new('alertmanager-' + $._config.alertmanager.name, { app: 'alertmanager', alertmanager: $._config.alertmanager.name }, alertmanagerPort) + service.mixin.metadata.withNamespace($._config.namespace) + - service.mixin.metadata.withLabels({ alertmanager: 'main' }), + service.mixin.metadata.withLabels({ alertmanager: $._config.alertmanager.name }), serviceMonitor: { @@ -57,7 +58,7 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by spec: { selector: { matchLabels: { - alertmanager: 'main', + alertmanager: $._config.alertmanager.name, }, }, namespaceSelector: { @@ -79,10 +80,10 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by apiVersion: 'monitoring.coreos.com/v1', kind: 'Alertmanager', metadata: { - name: 'main', + name: $._config.alertmanager.name, namespace: $._config.namespace, labels: { - alertmanager: 'main', + alertmanager: $._config.alertmanager.name, }, }, spec: { @@ -90,7 +91,7 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by version: $._config.versions.alertmanager, baseImage: $._config.imageRepos.alertmanager, nodeSelector: { 'beta.kubernetes.io/os': 'linux' }, - serviceAccountName: 'alertmanager-main', + serviceAccountName: 'alertmanager-' + $._config.alertmanager.name, }, }, }, diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index d2ae7ae6..27888eef 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -12,7 +12,12 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheus: 'quay.io/prometheus/prometheus', }, + alertmanager+:: { + name: 'main', + }, + prometheus+:: { + name: 'k8s', replicas: 2, rules: {}, renderedRules: {}, @@ -23,7 +28,7 @@ 
local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; serviceAccount: local serviceAccount = k.core.v1.serviceAccount; - serviceAccount.new('prometheus-k8s') + + serviceAccount.new('prometheus-' + $._config.prometheus.name) + serviceAccount.mixin.metadata.withNamespace($._config.namespace), service: local service = k.core.v1.service; @@ -31,25 +36,25 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local prometheusPort = servicePort.newNamed('web', 9090, 'web'); - service.new('prometheus-k8s', { app: 'prometheus', prometheus: 'k8s' }, prometheusPort) + + service.new('prometheus-' + $._config.prometheus.name, { app: 'prometheus', prometheus: $._config.prometheus.name }, prometheusPort) + service.mixin.metadata.withNamespace($._config.namespace) + - service.mixin.metadata.withLabels({ prometheus: 'k8s' }), + service.mixin.metadata.withLabels({ prometheus: $._config.prometheus.name }), rules: local configMap = k.core.v1.configMap; configMap.new('prometheus-k8s-rules', ({ 'all.rules.yaml': std.manifestYamlDoc($._config.prometheus.rules) } + $._config.prometheus.renderedRules)) + - configMap.mixin.metadata.withLabels({ role: 'alert-rules', prometheus: 'k8s' }) + + configMap.mixin.metadata.withLabels({ role: 'alert-rules', prometheus: $._config.prometheus.name }) + configMap.mixin.metadata.withNamespace($._config.namespace), roleBindingDefault: local roleBinding = k.rbac.v1.roleBinding; roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-k8s') + + roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.metadata.withNamespace('default') + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-k8s') + + roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: 
$._config.namespace }]), + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), clusterRole: local clusterRole = k.rbac.v1.clusterRole; local policyRule = clusterRole.rulesType; @@ -66,7 +71,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local rules = [nodeMetricsRule, metricsRule]; clusterRole.new() + - clusterRole.mixin.metadata.withName('prometheus-k8s') + + clusterRole.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + clusterRole.withRules(rules), roleConfig: local role = k.rbac.v1.role; @@ -92,26 +97,26 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + roleBinding.mixin.roleRef.withName('prometheus-k8s-config') + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), roleBindingNamespace: local roleBinding = k.rbac.v1.roleBinding; roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-k8s') + + roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.metadata.withNamespace($._config.namespace) + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-k8s') + + roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), clusterRoleBinding: local clusterRoleBinding = 
k.rbac.v1.clusterRoleBinding; clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('prometheus-k8s') + + clusterRoleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('prometheus-k8s') + + clusterRoleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), roleKubeSystem: local role = k.rbac.v1.role; local policyRule = role.rulesType; @@ -127,7 +132,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; policyRule.withVerbs(['get', 'list', 'watch']); role.new() + - role.mixin.metadata.withName('prometheus-k8s') + + role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + role.mixin.metadata.withNamespace('kube-system') + role.withRules(coreRule), roleDefault: @@ -145,19 +150,19 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; policyRule.withVerbs(['get', 'list', 'watch']); role.new() + - role.mixin.metadata.withName('prometheus-k8s') + + role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + role.mixin.metadata.withNamespace('default') + role.withRules(coreRule), roleBindingKubeSystem: local roleBinding = k.rbac.v1.roleBinding; roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-k8s') + + roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.metadata.withNamespace('kube-system') + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-k8s') + + 
roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-k8s', namespace: $._config.namespace }]), + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), roleNamespace: local role = k.rbac.v1.role; local policyRule = role.rulesType; @@ -173,7 +178,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; policyRule.withVerbs(['get', 'list', 'watch']); role.new() + - role.mixin.metadata.withName('prometheus-k8s') + + role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + role.mixin.metadata.withNamespace($._config.namespace) + role.withRules(coreRule), prometheus: @@ -188,29 +193,29 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; apiVersion: 'monitoring.coreos.com/v1', kind: 'Prometheus', metadata: { - name: 'k8s', + name: $._config.prometheus.name, namespace: $._config.namespace, labels: { - prometheus: 'k8s', + prometheus: $._config.prometheus.name, }, }, spec: { replicas: $._config.prometheus.replicas, version: $._config.versions.prometheus, baseImage: $._config.imageRepos.prometheus, - serviceAccountName: 'prometheus-k8s', + serviceAccountName: 'prometheus-' + $._config.prometheus.name, serviceMonitorSelector: selector.withMatchExpressions({ key: 'k8s-app', operator: 'Exists' }), nodeSelector: { 'beta.kubernetes.io/os': 'linux' }, ruleSelector: selector.withMatchLabels({ role: 'alert-rules', - prometheus: 'k8s', + prometheus: $._config.prometheus.name, }), resources: resources, alerting: { alertmanagers: [ { namespace: $._config.namespace, - name: 'alertmanager-main', + name: 'alertmanager-' + $._config.alertmanager.name, port: 'web', }, ], @@ -231,7 +236,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; spec: { selector: { matchLabels: { - prometheus: 'k8s', + prometheus: 
$._config.prometheus.name, }, }, namespaceSelector: { From 23cd630d33232bf64188db6ee2129a20dcbd4fe7 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 25 May 2018 22:46:58 +0200 Subject: [PATCH 266/638] Change the roleBinding and rules names too --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 27888eef..dbb903a7 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -42,7 +42,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; rules: local configMap = k.core.v1.configMap; - configMap.new('prometheus-k8s-rules', ({ 'all.rules.yaml': std.manifestYamlDoc($._config.prometheus.rules) } + $._config.prometheus.renderedRules)) + + configMap.new('prometheus-' + $._config.prometheus.name + '-rules', ({ 'all.rules.yaml': std.manifestYamlDoc($._config.prometheus.rules) } + $._config.prometheus.renderedRules)) + configMap.mixin.metadata.withLabels({ role: 'alert-rules', prometheus: $._config.prometheus.name }) + configMap.mixin.metadata.withNamespace($._config.namespace), roleBindingDefault: @@ -85,17 +85,17 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; policyRule.withVerbs(['get']); role.new() + - role.mixin.metadata.withName('prometheus-k8s-config') + + role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name + '-config') + role.mixin.metadata.withNamespace($._config.namespace) + role.withRules(configmapRule), roleBindingConfig: local roleBinding = k.rbac.v1.roleBinding; roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-k8s-config') + + roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name + '-config') + roleBinding.mixin.metadata.withNamespace($._config.namespace) + 
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-k8s-config') + + roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name + '-config') + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), roleBindingNamespace: From 7b9d97de7fab0a8e2590c69184c633f581217026 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 28 May 2018 10:30:37 +0200 Subject: [PATCH 267/638] Remove rules that have been migrated to kubernetes-mixins --- .../rules/kube-controller-manager.rules.yaml | 13 --- .../rules/kube-scheduler.rules.yaml | 58 ---------- .../rules/kube-state-metrics.rules.yaml | 59 ---------- assets/prometheus/rules/kubelet.rules.yaml | 48 -------- assets/prometheus/rules/kubernetes.rules.yaml | 106 ------------------ 5 files changed, 284 deletions(-) delete mode 100644 assets/prometheus/rules/kube-controller-manager.rules.yaml delete mode 100644 assets/prometheus/rules/kube-scheduler.rules.yaml delete mode 100644 assets/prometheus/rules/kube-state-metrics.rules.yaml delete mode 100644 assets/prometheus/rules/kubelet.rules.yaml delete mode 100644 assets/prometheus/rules/kubernetes.rules.yaml diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml deleted file mode 100644 index 4ea82ed1..00000000 --- a/assets/prometheus/rules/kube-controller-manager.rules.yaml +++ /dev/null @@ -1,13 +0,0 @@ -groups: -- name: kube-controller-manager.rules - rules: - - alert: K8SControllerManagerDown - expr: absent(up{job="kube-controller-manager"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: There is no running K8S controller manager. Deployments and replication - controllers are not making progress. 
- runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager - summary: Controller manager is down diff --git a/assets/prometheus/rules/kube-scheduler.rules.yaml b/assets/prometheus/rules/kube-scheduler.rules.yaml deleted file mode 100644 index 8f0c01fd..00000000 --- a/assets/prometheus/rules/kube-scheduler.rules.yaml +++ /dev/null @@ -1,58 +0,0 @@ -groups: -- name: kube-scheduler.rules - rules: - - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile - expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.99" - - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile - expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile - expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile - expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.99" - - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile - expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile - expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - record: cluster:scheduler_binding_latency_seconds:quantile - expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: 
"0.99" - - record: cluster:scheduler_binding_latency_seconds:quantile - expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: cluster:scheduler_binding_latency_seconds:quantile - expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - alert: K8SSchedulerDown - expr: absent(up{job="kube-scheduler"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: There is no running K8S scheduler. New pods are not being assigned - to nodes. - runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler - summary: Scheduler is down diff --git a/assets/prometheus/rules/kube-state-metrics.rules.yaml b/assets/prometheus/rules/kube-state-metrics.rules.yaml deleted file mode 100644 index 4c7041fe..00000000 --- a/assets/prometheus/rules/kube-state-metrics.rules.yaml +++ /dev/null @@ -1,59 +0,0 @@ -groups: -- name: kube-state-metrics.rules - rules: - - alert: DeploymentGenerationMismatch - expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation - for: 15m - labels: - severity: warning - annotations: - description: Observed deployment generation does not match expected one for - deployment {{$labels.namespace}}/{{$labels.deployment}} - summary: Deployment is outdated - - alert: DeploymentReplicasNotUpdated - expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) - or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) - unless (kube_deployment_spec_paused == 1) - for: 15m - labels: - severity: warning - annotations: - description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}} - summary: Deployment replicas are outdated - - alert: DaemonSetRolloutStuck - expr: kube_daemonset_status_number_ready / 
kube_daemonset_status_desired_number_scheduled - * 100 < 100 - for: 15m - labels: - severity: warning - annotations: - description: Only {{$value}}% of desired pods scheduled and ready for daemon - set {{$labels.namespace}}/{{$labels.daemonset}} - summary: DaemonSet is missing pods - - alert: K8SDaemonSetsNotScheduled - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not scheduled. - summary: Daemonsets are not scheduled correctly - - alert: DaemonSetsMissScheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are running where they are not supposed - to run. - summary: Daemonsets are not scheduled correctly - - alert: PodFrequentlyRestarting - expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 - for: 10m - labels: - severity: warning - annotations: - description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} - times within the last hour - summary: Pod is restarting frequently diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml deleted file mode 100644 index a4168404..00000000 --- a/assets/prometheus/rules/kubelet.rules.yaml +++ /dev/null @@ -1,48 +0,0 @@ -groups: -- name: kubelet.rules - rules: - - alert: K8SNodeNotReady - expr: kube_node_status_condition{condition="Ready",status="true"} == 0 - for: 1h - labels: - severity: warning - annotations: - description: The Kubelet on {{ $labels.node }} has not checked in with the API, - or has set itself to NotReady, for more than an hour - summary: Node status is NotReady - - alert: K8SManyNodesNotReady - expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) - > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == - 0) / 
count(kube_node_status_condition{condition="Ready",status="true"})) * 100 > 20 - for: 1m - labels: - severity: critical - annotations: - description: '{{ $value }}% of Kubernetes nodes are not ready' - - alert: K8SKubeletDown - expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 - for: 1h - labels: - severity: warning - annotations: - description: Prometheus failed to scrape {{ $value }}% of kubelets. - summary: Prometheus failed to scrape - - alert: K8SKubeletDown - expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) - * 100 > 10 - for: 1h - labels: - severity: critical - annotations: - description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets - have disappeared from service discovery. - summary: Many Kubelets cannot be scraped - - alert: K8SKubeletTooManyPods - expr: kubelet_running_pod_count > 100 - for: 10m - labels: - severity: warning - annotations: - description: Kubelet {{$labels.instance}} is running {{$value}} pods, close - to the limit of 110 - summary: Kubelet is close to pod limit diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml deleted file mode 100644 index 288841b7..00000000 --- a/assets/prometheus/rules/kubernetes.rules.yaml +++ /dev/null @@ -1,106 +0,0 @@ -groups: -- name: kubernetes.rules - rules: - - record: pod_name:container_memory_usage_bytes:sum - expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY - (pod_name) - - record: pod_name:container_spec_cpu_shares:sum - expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) - - record: pod_name:container_cpu_usage:sum - expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) - BY (pod_name) - - record: pod_name:container_fs_usage_bytes:sum - expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) - - record: 
namespace:container_memory_usage_bytes:sum - expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) - - record: namespace:container_spec_cpu_shares:sum - expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) - - record: namespace:container_cpu_usage:sum - expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) - BY (namespace) - - record: cluster:memory_usage:ratio - expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY - (cluster) / sum(machine_memory_bytes) BY (cluster) - - record: cluster:container_spec_cpu_shares:ratio - expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 - / sum(machine_cpu_cores) - - record: cluster:container_cpu_usage:ratio - expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) - / sum(machine_cpu_cores) - - record: apiserver_latency_seconds:quantile - expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / - 1e+06 - labels: - quantile: "0.99" - - record: apiserver_latency:quantile_seconds - expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / - 1e+06 - labels: - quantile: "0.9" - - record: apiserver_latency_seconds:quantile - expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / - 1e+06 - labels: - quantile: "0.5" - - alert: APIServerLatencyHigh - expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} - > 1 - for: 10m - labels: - severity: warning - annotations: - description: the API server has a 99th percentile latency of {{ $value }} seconds - for {{$labels.verb}} {{$labels.resource}} - summary: API server high latency - - alert: APIServerLatencyHigh - expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} - > 4 - for: 10m - labels: - severity: critical - annotations: - description: the 
API server has a 99th percentile latency of {{ $value }} seconds - for {{$labels.verb}} {{$labels.resource}} - summary: API server high latency - - alert: APIServerErrorsHigh - expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) - * 100 > 2 - for: 10m - labels: - severity: warning - annotations: - description: API server returns errors for {{ $value }}% of requests - summary: API server request errors - - alert: APIServerErrorsHigh - expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) - * 100 > 5 - for: 10m - labels: - severity: critical - annotations: - description: API server returns errors for {{ $value }}% of requests - - alert: K8SApiserverDown - expr: absent(up{job="apiserver"} == 1) - for: 20m - labels: - severity: critical - annotations: - description: No API servers are reachable or all have disappeared from service - discovery - summary: No API servers are reachable - - - alert: K8sCertificateExpirationNotice - labels: - severity: warning - annotations: - description: Kubernetes API Certificate is expiring soon (less than 7 days) - summary: Kubernetes API Certificate is expiering soon - expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 - - - alert: K8sCertificateExpirationNotice - labels: - severity: critical - annotations: - description: Kubernetes API Certificate is expiring in less than 1 day - summary: Kubernetes API Certificate is expiering - expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 From 0a98b8edaffbd6d3ab332104ca1b46d08c5f681e Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 28 May 2018 10:31:21 +0200 Subject: [PATCH 268/638] Remove grafana-dashboards-configmap-generator --- .../README.md | 51 --- .../bin/grafana_dashboards_generate.sh | 427 ------------------ .../output/README.md | 1 - .../templates/ConfigMap.header | 5 - .../templates/dashboard.foot | 0 
.../templates/dashboard.header | 0 .../grafana-dashboards-template.yaml | 7 - .../templates/grafana-dashboards/README.md | 1 - .../grafana-deployment-template.yaml | 45 -- 9 files changed, 537 deletions(-) delete mode 100644 hack/grafana-dashboards-configmap-generator/README.md delete mode 100755 hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh delete mode 100644 hack/grafana-dashboards-configmap-generator/output/README.md delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/dashboard.foot delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/dashboard.header delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards-template.yaml delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/README.md delete mode 100644 hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml diff --git a/hack/grafana-dashboards-configmap-generator/README.md b/hack/grafana-dashboards-configmap-generator/README.md deleted file mode 100644 index cc4f51a3..00000000 --- a/hack/grafana-dashboards-configmap-generator/README.md +++ /dev/null @@ -1,51 +0,0 @@ -# Grafana Dashboards Configmap Generator - -## Description: -Tool to maintain grafana dashboards' configmap for a grafana deployed with kube-prometheus (a tool inside prometheus-operator). - -The tool reads the content of a directory with grafana .json resources (dashboards and datasources) and creates a manifest file under output/ directory with all the content from the files in a Kubernetes ConfigMap format. - -Based on a configurable size limit, the tool will create 1 or N configmaps to allocate the .json resources (bin packing). 
If the limit is reached then the configmaps generated will have names like grafana-dashboards-0, grafana-dashboards-1, etc, and if the limit is not reached the configmap generated will be called "grafana-dashboards". - -Input Parameters Allowed: -```bash --i dir, --input-dir dir - Directory with grafana dashboards to process. - Important notes: - Files should be suffixed with -dashboard.json or -datasource.json. - We don't recommend file names with spaces. - --o file, --output-file file - Output file for config maps. - --s NUM, --size-limit NUM - Size limit in bytes for each dashboard (default: 240000) - --n namespace, --namespace namespace - Namespace for the configmap (default: monitoring). - --x, --apply-configmap - Applies the generated configmap with kubectl. - ---apply-type - Type of kubectl command. Accepted values: apply, replace, create (default: apply). -``` - -## Usage - -Just execute the .sh under bin/ directory. The output will be placed in the output/ directory. - -Examples: -```bash -$ ./grafana_dashboards_generate.sh -$ bin/grafana_dashboards_generate.sh -o manifests/grafana/grafana-dashboards.yaml -i assets/grafana-dashboards -$ bin/grafana_dashboards_generate.sh -s 1000000 --apply-configmap --apply-type replace - -# Note: the output file, if provided with -o, shouldn't exist. -``` - -## Configuration and options - -* Put the json files you want to pack in the templates/grafana-dashboards/ directory -* Size limit default is 240000 bytes due to the annotations size limit in kubernetes of 256KB. 
- diff --git a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh b/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh deleted file mode 100755 index ee6b49e7..00000000 --- a/hack/grafana-dashboards-configmap-generator/bin/grafana_dashboards_generate.sh +++ /dev/null @@ -1,427 +0,0 @@ -#!/bin/bash - -# exit immediately when a command fails -set -e -# only exit with zero if all commands of the pipeline exit successfully -set -o pipefail -# error on unset variables -set -u - -# Author: eedugon - -# Description: Tool to maintain grafana dashboards configmap for a grafana deployed -# with kube-prometheus (a tool inside prometheus-operator) -# The tool reads the content of a directory with grafana .json resources -# that need to be moved into a configmap. -# Based on a configurable size limit, the tool will create 1 or N configmaps -# to allocate the .json resources (bin packing) - -# Update: 20170914 -# The tool also generates a grafana deployment manifest (-g option) - -# parameters -# -o, --output-file -# -g, --grafana-manifest-file -# -i, --input-dir -# -s, --size-limit -# -x, --apply-configmap : true or false (default = false) -# --apply-type : create, replace, apply (default = apply) - -# -# Basic Functions -# -echoSyntax() { - echo "Usage: ${0} [options]" - echo "Options:" - echo -e "\t-i dir, --input-dir dir" - echo -e "\t\tDirectory with grafana dashboards to process." - echo -e "\t\tImportant notes:" - echo -e "\t\t\tFiles should be suffixed with -dashboard.json or -datasource.json." - echo -e "\t\t\tWe don't recommend file names with spaces." - echo - echo -e "\t-o file, --output-file file" - echo -e "\t\tOutput file for config maps." - echo - echo -e "\t-s NUM, --size-limit NUM" - echo -e "\t\tSize limit in bytes for each dashboard (default: 240000)" - echo - echo -e "\t-n namespace, --namespace namespace" - echo -e "\t\tNamespace for the configmap (default: monitoring)." 
- echo - echo -e "\t-x, --apply-configmap" - echo -e "\t\tApplies the generated configmap with kubectl." - echo - echo -e "\t--apply-type" - echo -e "\t\tType of kubectl command. Accepted values: apply, replace, create (default: apply)." -} - - -# # Apply changes --> environment allowed -# test -z "$APPLY_CONFIGMAP" && APPLY_CONFIGMAP="false" -# # Size limit --> environment set allowed -# test -z "$DATA_SIZE_LIMIT" && DATA_SIZE_LIMIT="240000" # in bytes -# # Changes type: in case of problems with k8s configmaps, try replace. Should be apply -# test -z "$APPLY_TYPE" && APPLY_TYPE="apply" -# # Input values verification -# echo "$DATA_SIZE_LIMIT" | grep -q "^[0-9]\+$" || { echo "ERROR: Incorrect value for DATA_SIZE_LIMIT: $DATA_SIZE_LIMIT. Number expected"; exit 1; } - -# Base variables (do not change them) -DATE_EXEC="$(date "+%Y-%m-%d-%H%M%S")" -BIN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -TOOL_HOME="$(dirname $BIN_DIR)" -SCRIPT_BASE=`basename $0 | sed "s/\.[Ss][Hh]//"` -CONFIGMAP_DASHBOARD_PREFIX="grafana-dashboard-definitions" - -TEMPLATES_DIR="$TOOL_HOME/templates" -DASHBOARD_HEADER_FILE="$TEMPLATES_DIR/dashboard.header" -DASHBOARD_FOOT_FILE="$TEMPLATES_DIR/dashboard.foot" -CONFIGMAP_HEADER="$TEMPLATES_DIR/ConfigMap.header" -GRAFANA_DEPLOYMENT_TEMPLATE="$TEMPLATES_DIR/grafana-deployment-template.yaml" -GRAFANA_DASHBOARDS_TEMPLATE="$TEMPLATES_DIR/grafana-dashboards-template.yaml" -OUTPUT_BASE_DIR="$TOOL_HOME/output" - -# Some default values -OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-dashboards-configMap-$DATE_EXEC.yaml" -GRAFANA_OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-deployment-$DATE_EXEC.yaml" -GRAFANA_DASHBOARD_OUTPUT_FILE="$OUTPUT_BASE_DIR/grafana-dashboards-$DATE_EXEC.yaml" -DASHBOARDS_DIR="$TEMPLATES_DIR/grafana-dashboards" - -APPLY_CONFIGMAP="false" -APPLY_TYPE="apply" -DATA_SIZE_LIMIT="240000" -NAMESPACE="monitoring" - -# Input parameters -while (( "$#" )); do - case "$1" in - "-o" | "--output-file") - OUTPUT_FILE="$2" - shift - ;; - "-g" | 
"--grafana-output-file") - GRAFANA_OUTPUT_FILE="$2" - shift - ;; - "-d" | "--grafana-dashboard-output-file") - GRAFANA_DASHBOARD_OUTPUT_FILE="$2" - shift - ;; - "-i" | "--input-dir") - DASHBOARDS_DIR="$2" - shift - ;; - "-n" | "--namespace") - NAMESPACE="$2" - shift - ;; - "-x" | "--apply-configmap") - APPLY_CONFIGMAP="true" - ;; - "--apply-type") - APPLY_TYPE="$2" - test "$APPLY_TYPE" != "create" && test "$APPLY_TYPE" != "apply" && test "$APPLY_TYPE" != "replace" && { echo "Unexpected APPLY_TYPE: $APPLY_TYPE"; exit 1; } - shift - ;; - "-s"|"--size-limit") - if ! ( echo $2 | grep -q '^[0-9]\+$') || [ $2 -eq 0 ]; then - echo "Invalid value for size limit '$2'" - exit 1 - fi - DATA_SIZE_LIMIT=$2 - shift - ;; - "-h"|"--help") - echoSyntax - exit 0 - ;; - *) - echo "Unknown argument: $1" - exit 1 - ;; - esac - shift -done - -# -# Auxiliary Functions -# -indentMultiLineString() { - # Indent a given string (in one line including multiple \n) - test "$#" -eq 2 || { echo "INTERNAL ERROR: wrong call to function indentMultiLineString"; exit 1; } - local indent_number="$1" - local string="$2" - - test "$indent_number" -ge 0 || { echo "INTERNAL ERROR: wrong indent number parameter: $indent_number"; exit 1; } - - # prepare indentation text - local indent_string="" - for (( c=0; c<$indent_number; c++ )); do - indent_string="$indent_string " - done - - echo "$string" | sed -e "s#^#$indent_string#" -e "s#\\\n#\\\n$indent_string#g" -} - -# -# Main Functions -# -addConfigMapHeader() { - # If a parameter is provided it will be used as the configmap index. 
- # If no parameter is provided, the name will be kept - test "$#" -le 1 || { echo "# INTERNAL ERROR: Wrong call to function addConfigMapHeader"; return 1; } - test "$#" -eq 1 && local id="$1" || local id="" - - if [ "$id" ]; then - cat "$CONFIGMAP_HEADER" | sed "s/name: $CONFIGMAP_DASHBOARD_PREFIX/name: $CONFIGMAP_DASHBOARD_PREFIX-$id/" - else - cat "$CONFIGMAP_HEADER" - fi -} - -addArrayToConfigMap() { - # This function process the array to_process into a configmap - local file="" - local OLDIFS=$IFS - local IFS=$'\n' - for file in ${to_process[@]}; do - # check that file exists - test -f "$file" || { echo "# INTERNAL ERROR IN ARRAY: File not found: $file"; continue; } - - # detection of type (dashboard or datasource) - type="" - basename "$file" | grep -q "\-datasource" && type="datasource" - basename "$file" | grep -q "\-dashboard" && type="dashboard" - test "$type" || { echo "# ERROR: Unrecognized file type: $(basename $file)"; return 1; } - - #echo "# Processing $type $file" - # Indent 2 - echo " $(basename $file): |+" - - # Dashboard header: No indent needed - test "$type" = "dashboard" && cat $DASHBOARD_HEADER_FILE - - # File content: Indent 4 - cat $file | sed "s/^/ /" - - # If source file was not ended properly we add newline character - [ "$(tail -c 1 "$file")" ] && echo - - # Dashboard foot - test "$type" = "dashboard" && cat $DASHBOARD_FOOT_FILE - done - echo "---" - - IFS=$OLDIFS - return 0 -} - -initialize-bin-pack() { - # We separate initialization to reuse the bin-pack for different sets of files. - n="0" - to_process=() - bytes_to_process="0" - total_files_processed="0" - total_configmaps_created="0" -} - -bin-pack-files() { - # Algorithm: - # We process the files with no special order consideration - # We create an array/queue of "files to add to configmap" called "to_process" - # Size of the file is analyzed to determine if it can be added to the queue or not. 
- # the max size of the queue is limited by DATA_SIZE_LIMIT - # while there's room available in the queue we add files. - # when there's no room we create a configmap with the members of the queue - # before adding the file to a cleaned queue - - # Counters initialization is not in the scope of this function - local file="" - OLDIFS=$IFS - IFS=$'\n' -# echo "DEBUG bin-pack:" -# echo "$@" - - for file in $@; do - test -f "$file" || { echo "# INTERNAL ERROR: File not found: $file"; continue; } -# echo "debug: Processing file $(basename $file)" - - file_size_bytes="$(stat -c%s "$file")" || true - - # If the file is bigger than the configured limit we skip it file - if [ "$file_size_bytes" -gt "$DATA_SIZE_LIMIT" ]; then - echo "ERROR: File $(basename $file) bigger than size limit: $DATA_SIZE_LIMIT ($file_size_bytes). Skipping" - continue - fi - (( total_files_processed++ )) || true - - if test "$(expr "$bytes_to_process" + "$file_size_bytes")" -le "$DATA_SIZE_LIMIT"; then - # We have room to include the file in the configmap - # test "$to_process" && to_process="$to_process $file" || to_process="$file" - to_process+=("$file") - (( bytes_to_process = bytes_to_process + file_size_bytes )) || true - echo "# File $(basename $file) : added to queue" - else - # There's no room to add this file to the queue. so we process what we have and add the file to the queue - if [ "$to_process" ]; then - echo - echo "# Size limit ($DATA_SIZE_LIMIT) reached. Processing queue with $bytes_to_process bytes. 
Creating configmap with id $n" - echo - # Create a new configmap - addConfigMapHeader $n >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } - addArrayToConfigMap >> $OUTPUT_FILE || { echo "ERROR in call to addArrayToConfigMap function"; exit 1; } - # Initialize variables with info about file not processed - (( total_configmaps_created++ )) || true - (( n++ )) || true - # to_process="$file" - to_process=() - to_process+=("$file") - bytes_to_process="$file_size_bytes" - echo "# File $(basename $file) : added to queue" - else - # based on the algorithm the queue should never be empty if we reach this part of the code - # if this happens maybe bytes_to_process was not aligned with the queue (to_process) - echo "ERROR (unexpected)" - fi - fi - done - IFS=$OLDIFS -} - -# prepareGrafanaDeploymentManifest() { -# local num_configmaps="$1" -# -# for (( i=0; i<$total_configmaps_created; i++ )); do -# echo "Creating deployment for $CONFIGMAP_DASHBOARD_PREFIX-$i" -# -# done -# } - - -# Some variables checks... -test ! -d "$TEMPLATES_DIR" && { echo "ERROR: missing templates directory $TEMPLATES_DIR"; exit 1; } - -test -f "$DASHBOARD_FOOT_FILE" || { echo "Template $DASHBOARD_FOOT_FILE not found"; exit 1; } -test -f "$DASHBOARD_HEADER_FILE" || { echo "Template $DASHBOARD_HEADER_FILE not found"; exit 1; } -test -f "$CONFIGMAP_HEADER" || { echo "Template $CONFIGMAP_HEADER not found"; exit 1; } -test -f "$GRAFANA_DEPLOYMENT_TEMPLATE" || { echo "Template $GRAFANA_DEPLOYMENT_TEMPLATE not found"; exit 1; } -test -f "$GRAFANA_DASHBOARDS_TEMPLATE" || { echo "Template $GRAFANA_DEPLOYMENT_TEMPLATE not found"; exit 1; } - -test ! 
-d "$OUTPUT_BASE_DIR" && { echo "ERROR: missing directory $OUTPUT_BASE_DIR"; exit 1; } - -# Initial checks -test -d "$DASHBOARDS_DIR" || { echo "ERROR: Dashboards directory not found: $DASHBOARDS_DIR"; echoSyntax; exit 1; } - -test -f "$OUTPUT_FILE" && { echo "ERROR: Output file already exists: $OUTPUT_FILE"; exit 1; } -test -f "$GRAFANA_OUTPUT_FILE" && { echo "ERROR: Output file already exists: $GRAFANA_OUTPUT_FILE"; exit 1; } -test -f "$GRAFANA_DASHBOARD_OUTPUT_FILE" && { echo "ERROR: Output file already exists: $GRAFANA_DASHBOARD_OUTPUT_FILE"; exit 1; } -touch $OUTPUT_FILE || { echo "ERROR: Unable to create or modify $OUTPUT_FILE"; exit 1; } -touch $GRAFANA_OUTPUT_FILE || { echo "ERROR: Unable to create or modify $GRAFANA_OUTPUT_FILE"; exit 1; } - -# Main code start - -echo "# Starting execution of $SCRIPT_BASE on $DATE_EXEC" -echo "# Configured size limit: $DATA_SIZE_LIMIT bytes" -echo "# Grafana input dashboards and datasources will be read from: $DASHBOARDS_DIR" -echo "# Grafana Dashboards ConfigMap will be created into file:" -echo "$OUTPUT_FILE" -echo "# Grafana Deployment manifest will be created into file:" -echo "$GRAFANA_OUTPUT_FILE" -echo - -# Loop variables initialization -initialize-bin-pack - -# Process dashboards -bin-pack-files "$(find $DASHBOARDS_DIR -maxdepth 1 -type f -name "*-dashboard.json" | sort)" - -# Continue processing datasources (maintaining the same queue) -# -# Commented out, as datasources are provisionable by Grafana by default in Grafana v5, but from a separate directory, meaning a separate ConfigMap for us. -# -# bin-pack-files "$(find $DASHBOARDS_DIR -maxdepth 1 -type f -name "*-datasource.json" | sort )" - -# Processing remaining data in the queue (or unique) -if [ "$to_process" ]; then - if [ "$n" -eq 0 ]; then - echo - echo "# Size limit not reached ($bytes_to_process). 
Adding all files into basic configmap" - echo - addConfigMapHeader $n >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } - else - echo - echo "# Size limit not reached ($bytes_to_process). Adding remaining files into configmap with id $n" - echo - addConfigMapHeader $n >> $OUTPUT_FILE || { echo "ERROR in call to addConfigMapHeader function"; exit 1; } - fi - addArrayToConfigMap >> $OUTPUT_FILE || { echo "ERROR in call to addArrayToConfigMap function"; exit 1; } - (( total_configmaps_created++ )) || true - to_process=() -fi - -echo "# Process completed, configmap created: $(basename $OUTPUT_FILE)" -echo "# Summary" -echo "# Total files processed: $total_files_processed" -echo "# Total amount of ConfigMaps inside the manifest: $total_configmaps_created" -echo -# Grafana deployment Processing (for every configmap) -#prepareGrafanaDeploymentManifest "$total_configmaps_created" -VOLUMES="" -VOLUME_MOUNTS="" -WATCH_DIR="" -for (( i=0; i<$total_configmaps_created; i++ )); do - configmap="$CONFIGMAP_DASHBOARD_PREFIX-$i" - echo "# Preparing grafana deployment to support configmap: $configmap" - - test "$VOLUME_MOUNTS" && VOLUME_MOUNTS="$VOLUME_MOUNTS\n- name: $configmap\n mountPath: /grafana-dashboard-definitions/$i" || VOLUME_MOUNTS="- name: $configmap\n mountPath: /grafana-dashboard-definitions/$i" - test "$VOLUMES" && VOLUMES="$VOLUMES\n- name: $configmap\n configMap:\n name: $configmap" || VOLUMES="- name: $configmap\n configMap:\n name: $configmap" - test "$WATCH_DIR" && WATCH_DIR="$WATCH_DIR\n- '--watch-dir=/var/$configmap'" || WATCH_DIR="- '--watch-dir=/var/$configmap'" - # echo "DEBUG:" - # echo "VOLUMES: $VOLUMES" - # echo "VOLUME_MOUNTS: $VOLUME_MOUNTS" - # echo "WATCH_DIR: $WATCH_DIR" - echo -done - -echo "# Processing grafana deployment template into $GRAFANA_OUTPUT_FILE" -sed -e "s#XXX_VOLUMES_XXX#$(indentMultiLineString 6 "$VOLUMES")#" \ - -e "s#XXX_VOLUME_MOUNTS_XXX#$(indentMultiLineString 8 "$VOLUME_MOUNTS")#" \ - -e 
"s#XXX_WATCH_DIR_XXX#$(indentMultiLineString 10 "$WATCH_DIR")#" \ - $GRAFANA_DEPLOYMENT_TEMPLATE > $GRAFANA_OUTPUT_FILE - -echo -echo "# Generating Grafana dashboard sources file for $total_configmaps_created directories" -DASHBOARD_SOURCES="" -for (( j=0; j<$total_configmaps_created; j++ )); do - echo "# Preparing grafana dashboards sources to support configmap: /grafana-dashboard-definitions/$j" - test "$DASHBOARD_SOURCES" && DASHBOARD_SOURCES="$DASHBOARD_SOURCES\n- name: '$j'\n org_id: 1\n folder: ''\n type: file\n options:\n folder: /grafana-dashboard-definitions/$j" || DASHBOARD_SOURCES="- name: '$j'\n org_id: 1\n folder: ''\n type: file\n options:\n folder: /grafana-dashboard-definitions/$j" - - # echo "DEBUG:" - # echo "DASHBOARD_SOURCES: $DASHBOARD_SOURCES" - echo -done - -echo "# Processing grafana dashboards template into $GRAFANA_DASHBOARD_OUTPUT_FILE" -sed -e "s#XXX_DASHBOARDS_XXX#$(indentMultiLineString 4 "$DASHBOARD_SOURCES")#" \ - $GRAFANA_DASHBOARDS_TEMPLATE > $GRAFANA_DASHBOARD_OUTPUT_FILE - -# If output file is empty we can delete it and exit -test ! -s "$OUTPUT_FILE" && { echo "# Configmap empty, deleting file"; rm $OUTPUT_FILE; exit 0; } -test ! -s "$GRAFANA_OUTPUT_FILE" && { echo "# Configmap empty, deleting file"; rm $GRAFANA_OUTPUT_FILE; exit 0; } - -if [ "$APPLY_CONFIGMAP" = "true" ]; then - test -x "$(which kubectl)" || { echo "ERROR: kubectl command not available. Apply configmap not possible"; exit 1; } - echo "# Applying configuration with $APPLY_TYPE method on namespace $NAMESPACE" - if kubectl -n $NAMESPACE $APPLY_TYPE -f "$OUTPUT_FILE"; then - echo - echo "# ConfigMap updated. Updating grafana deployment" - kubectl -n $NAMESPACE $APPLY_TYPE -f "$GRAFANA_OUTPUT_FILE" || { echo "Error applying Grafana deployment. Check yaml file: $GRAFANA_OUTPUT_FILE"; exit 1; } - else - echo "Error applying Configmap. 
Check yaml file: $OUTPUT_FILE" - fi -else - echo - echo "# To apply the new configMap to your k8s system do something like:" - echo "kubectl -n monitoring $APPLY_TYPE -f $OUTPUT_FILE" - echo "kubectl -n monitoring $APPLY_TYPE -f $GRAFANA_OUTPUT_FILE" - echo -fi diff --git a/hack/grafana-dashboards-configmap-generator/output/README.md b/hack/grafana-dashboards-configmap-generator/output/README.md deleted file mode 100644 index d2e3c8a6..00000000 --- a/hack/grafana-dashboards-configmap-generator/output/README.md +++ /dev/null @@ -1 +0,0 @@ -### This directory will include all generated manifests if no specific options are given diff --git a/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header b/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header deleted file mode 100644 index 73a14b05..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/ConfigMap.header +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboard-definitions -data: diff --git a/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot b/hack/grafana-dashboards-configmap-generator/templates/dashboard.foot deleted file mode 100644 index e69de29b..00000000 diff --git a/hack/grafana-dashboards-configmap-generator/templates/dashboard.header b/hack/grafana-dashboards-configmap-generator/templates/dashboard.header deleted file mode 100644 index e69de29b..00000000 diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards-template.yaml deleted file mode 100644 index a8b00982..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards-template.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboards -data: - dashboards.yaml: |+ -XXX_DASHBOARDS_XXX diff --git 
a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/README.md b/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/README.md deleted file mode 100644 index 69be0eec..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-dashboards/README.md +++ /dev/null @@ -1 +0,0 @@ -# Add your grafana dashboards into this directory diff --git a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml b/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml deleted file mode 100644 index 9b978e90..00000000 --- a/hack/grafana-dashboards-configmap-generator/templates/grafana-deployment-template.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: apps/v1beta1 -kind: Deployment -metadata: - name: grafana -spec: - replicas: 1 - template: - metadata: - labels: - app: grafana - spec: - securityContext: - runAsNonRoot: true - runAsUser: 65534 - containers: - - name: grafana - image: quay.io/coreos/monitoring-grafana:5.0.3 - volumeMounts: - - name: grafana-storage - mountPath: /data - - name: grafana-datasources - mountPath: /grafana/conf/provisioning/datasources - - name: grafana-dashboards - mountPath: /grafana/conf/provisioning/dashboards -XXX_VOLUME_MOUNTS_XXX - ports: - - name: web - containerPort: 3000 - resources: - requests: - memory: 100Mi - cpu: 100m - limits: - memory: 200Mi - cpu: 200m - volumes: - - name: grafana-storage - emptyDir: {} - - name: grafana-datasources - configMap: - name: grafana-datasources - - name: grafana-dashboards - configMap: - name: grafana-dashboards -XXX_VOLUMES_XXX From 25a3ae65b2bd6b65d6cf248124ceaf7bfc8aad31 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 28 May 2018 16:53:51 +0200 Subject: [PATCH 269/638] Remove static alertmanager config --- assets/alertmanager/alertmanager.yaml | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 assets/alertmanager/alertmanager.yaml diff --git 
a/assets/alertmanager/alertmanager.yaml b/assets/alertmanager/alertmanager.yaml deleted file mode 100644 index 6b5789b5..00000000 --- a/assets/alertmanager/alertmanager.yaml +++ /dev/null @@ -1,14 +0,0 @@ -global: - resolve_timeout: 5m -route: - group_by: ['job'] - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - receiver: 'null' - routes: - - match: - alertname: DeadMansSwitch - receiver: 'null' -receivers: -- name: 'null' From 309974fadb968e7a776bd0506d100cfc84a72a1e Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 28 May 2018 16:54:06 +0200 Subject: [PATCH 270/638] Remove old python script to generate Grafana dashboards --- assets/grafana/_grafanalib.py | 91 -- assets/grafana/deployment.dashboard.py | 467 ------- assets/grafana/generated/.gitignore | 2 - assets/grafana/generated/.keep | 0 .../kubernetes-capacity-planning.dashboard.py | 465 ------- .../kubernetes-cluster-health.dashboard.py | 405 ------ .../kubernetes-cluster-status.dashboard.py | 450 ------- ...bernetes-control-plane-status.dashboard.py | 344 ----- .../kubernetes-resource-requests.dashboard.py | 205 --- assets/grafana/nodes.dashboard.py | 423 ------ assets/grafana/pods.dashboard.py | 255 ---- assets/grafana/prometheus-datasource.json | 7 - .../raw-json-dashboards/etcd-dashboard.json | 1158 ----------------- assets/grafana/statefulset.dashboard.py | 440 ------- 14 files changed, 4712 deletions(-) delete mode 100644 assets/grafana/_grafanalib.py delete mode 100644 assets/grafana/deployment.dashboard.py delete mode 100644 assets/grafana/generated/.gitignore delete mode 100644 assets/grafana/generated/.keep delete mode 100644 assets/grafana/kubernetes-capacity-planning.dashboard.py delete mode 100644 assets/grafana/kubernetes-cluster-health.dashboard.py delete mode 100644 assets/grafana/kubernetes-cluster-status.dashboard.py delete mode 100644 assets/grafana/kubernetes-control-plane-status.dashboard.py delete mode 100644 
assets/grafana/kubernetes-resource-requests.dashboard.py delete mode 100644 assets/grafana/nodes.dashboard.py delete mode 100644 assets/grafana/pods.dashboard.py delete mode 100644 assets/grafana/prometheus-datasource.json delete mode 100644 assets/grafana/raw-json-dashboards/etcd-dashboard.json delete mode 100644 assets/grafana/statefulset.dashboard.py diff --git a/assets/grafana/_grafanalib.py b/assets/grafana/_grafanalib.py deleted file mode 100644 index b304809d..00000000 --- a/assets/grafana/_grafanalib.py +++ /dev/null @@ -1,91 +0,0 @@ -from grafanalib import core -from grafanalib.core import Graph, Time, SparkLine, \ - Gauge, Templating, XAxis, YAxes - - -def Dashboard( - title, version, time, rows, graphTooltip=0, templating=None, -): - optional_args = {} - if templating is not None: - optional_args['templating'] = templating - return core.Dashboard( - title=title, refresh=None, schemaVersion=14, - version=version, time=time, timezone='browser', inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], rows=rows, graphTooltip=graphTooltip, editable=False, **optional_args, - ) - - -def Row( - panels, height=None, title='Dashboard Row', showTitle=False -): - assert isinstance(height, (type(None), int)) - return core.Row( - panels=panels, height=height, title=title, showTitle=showTitle, - titleSize='h6', editable=False, - ) - - -def SingleStat( - title, id, targets, colorValue=False, gauge=Gauge(show=True), - valueFontSize='80%', thresholds=None, valueName='avg', valueMaps=None, - rangeMaps=None, mappingTypes=None, mappingType=None, postfix=None, - sparkline=SparkLine(), prefixFontSize='50%', colors=[ - (50, 172, 45, 0.97), - (237, 129, 40, 0.89), - (245, 54, 54, 0.9), - ], span=None, format='none', transparent=None, -): - def merge_target(target): - return {**{ - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - }, **target} - targets = 
[merge_target(t) for t in targets] - - return core.SingleStat( - title=title, id=id, colorValue=colorValue, - dataSource='prometheus', gauge=gauge, - valueFontSize=valueFontSize, thresholds=thresholds, - valueName=valueName, valueMaps=valueMaps, rangeMaps=rangeMaps, - mappingTypes=mappingTypes, targets=targets, - mappingType=mappingType, format=format, colors=colors, span=span, - postfix=postfix, sparkline=sparkline, prefixFontSize=prefixFontSize, - hideTimeOverride=None, transparent=transparent, editable=False, - ) - - -def Graph( - id, title, targets, dashLength=None, dashes=False, spaceLength=None, - xAxis=None, yAxes=None, nullPointMode='connected', -): - def merge_target(target): - return {**{ - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, **target} - - targets = [merge_target(t) for t in targets] - assert isinstance(yAxes, YAxes) - return core.Graph( - id=id, title=title, dashLength=dashLength, dashes=dashes, - spaceLength=spaceLength, targets=targets, xAxis=xAxis, yAxes=yAxes, - dataSource='prometheus', nullPointMode=nullPointMode, editable=False, - ) - - -def YAxis(format='none', label='', min=0, show=True): - return core.YAxis( - format=format, label=label, min=min, show=show - ) diff --git a/assets/grafana/deployment.dashboard.py b/assets/grafana/deployment.dashboard.py deleted file mode 100644 index 6cecd4bf..00000000 --- a/assets/grafana/deployment.dashboard.py +++ /dev/null @@ -1,467 +0,0 @@ -import sys -import os.path -sys.path.insert(0, os.path.dirname(__file__)) -from _grafanalib import * - - -dashboard = Dashboard( - title='Deployment', - version=1, - graphTooltip=1, - time=Time(start='now-6h'), - templating=Templating(list=[ - { - 'allValue': '.*', - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': 'Namespace', - 'multi': False, - 'name': 'deployment_namespace', - 'options': [], - 'query': 'label_values(kube_deployment_metadata_generation, ' - 'namespace)', - 'refresh': 
1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': None, - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - { - 'allValue': None, - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': 'Deployment', - 'multi': False, - 'name': 'deployment_name', - 'options': [], - 'query': 'label_values(kube_deployment_metadata_generation' - '{namespace="$deployment_namespace"}, deployment)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': 'deployment', - 'type': 'query', - 'useTags': False, - }, - ]), - rows=[ - Row(panels=[ - SingleStat( - title='CPU', - id=8, - gauge=Gauge(show=False), - postfix='cores', - span=4, - valueFontSize='110%', - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - targets=[ - { - 'expr': 'sum(rate(container_cpu_usage_seconds_total' - '{namespace=\"$deployment_namespace\",pod_name=~\"' - '$deployment_name.*\"}[3m]))', - }, - ], - ), - SingleStat( - title='Memory', - id=9, - postfix='GB', - prefixFontSize='80%', - gauge=Gauge(show=False), - span=4, - valueFontSize='110%', - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), 
- (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': 'sum(container_memory_usage_bytes{namespace=' - '\"$deployment_namespace\",pod_name=~\"$' - 'deployment_name.*\"}) / 1024^3', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Network', - format='Bps', - gauge=Gauge(thresholdMarkers=False), - id=7, - postfix='', - span=4, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': 'sum(rate(container_network_transmit_' - 'bytes_total' - '{namespace=\"$deployment_namespace\",pod_name=~\"' - '$deployment_name.*\"}[3m])) + ' - 'sum(rate(container_network_receive_bytes_total' - '{namespace=\"$deployment_namespace\",pod_name=~' - '\"$deployment_name.*\"}[3m]))', - }, - ], - ), - ], - height=200, - ), - Row( - height=100, panels=[ - SingleStat( - title='Desired Replicas', - id=5, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - span=3, - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'metric': 'kube_deployment_spec_replicas', - 'expr': 'max(kube_deployment_spec_replicas' - '{deployment="$deployment_name",namespace=' - '"$deployment_namespace"}) without ' - '(instance, pod)', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - gauge=Gauge(thresholdMarkers=False, show=False), - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - ), - SingleStat( - title='Available Replicas', - 
colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(show=False), - id=6, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'min(kube_deployment_status_replicas_' - 'available{deployment=\"$deployment_name\",' - 'namespace=\"$deployment_namespace\"}) without ' - '(instance, pod)', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - ), - SingleStat( - title='Observed Generation', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(), - id=3, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'max(kube_deployment_status_observed_' - 'generation{deployment=\"$deployment_name\",' - 'namespace=\"$deployment_namespace\"}) without ' - '(instance, pod)', - }, - ], - rangeMaps=[ - { - 'from': "null", - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - ), - SingleStat( - title='Metadata Generation', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(show=False), - id=2, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'max(kube_deployment_metadata_generation' - '{deployment=\"$deployment_name\",namespace=\"' - '$deployment_namespace\"}) without (instance, ' - 'pod)', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 
'null', - }, - ], - ), - ], - ), - Row( - height=350, panels=[ - Graph( - title='Replicas', - dashLength=10, - dashes=False, - id=1, - spaceLength=10, - targets=[ - { - 'expr': 'max(kube_deployment_status_replicas' - '{deployment=\"$deployment_name\",namespace=\"' - '$deployment_namespace\"}) without (instance, ' - 'pod)', - 'legendFormat': 'current replicas', - 'refId': 'A', - 'step': 30, - }, - { - 'expr': 'min(kube_deployment_status_replicas_' - 'available{deployment=\"$deployment_name\",' - 'namespace=\"$deployment_namespace\"}) without ' - '(instance, pod)', - 'legendFormat': 'available', - 'refId': 'B', - 'step': 30, - }, - { - 'expr': 'max(kube_deployment_status_replicas_' - 'unavailable{deployment=\"$deployment_name\",' - 'namespace=\"$deployment_namespace\"}) without ' - '(instance, pod)', - 'legendFormat': 'unavailable', - 'refId': 'C', - 'step': 30, - }, - { - 'expr': 'min(kube_deployment_status_replicas_' - 'updated{deployment=\"$deployment_name\",' - 'namespace=\"$deployment_namespace\"}) without ' - '(instance, pod)', - 'legendFormat': 'updated', - 'refId': 'D', - 'step': 30, - }, - { - 'expr': 'max(kube_deployment_spec_replicas' - '{deployment=\"$deployment_name\",namespace=\"' - '$deployment_namespace\"}) without ' - '(instance, pod)', - 'legendFormat': 'desired', - 'refId': 'E', - 'step': 30, - } - ], - xAxis=XAxis(mode='time'), - yAxes=YAxes( - YAxis(min=None), - YAxis(format='short', min=None, show=False), - ), - ), - ] - ), - ], -) diff --git a/assets/grafana/generated/.gitignore b/assets/grafana/generated/.gitignore deleted file mode 100644 index 92063fdc..00000000 --- a/assets/grafana/generated/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*-dashboard.json -*-datasource.json diff --git a/assets/grafana/generated/.keep b/assets/grafana/generated/.keep deleted file mode 100644 index e69de29b..00000000 diff --git a/assets/grafana/kubernetes-capacity-planning.dashboard.py b/assets/grafana/kubernetes-capacity-planning.dashboard.py deleted file mode 
100644 index 9b02010a..00000000 --- a/assets/grafana/kubernetes-capacity-planning.dashboard.py +++ /dev/null @@ -1,465 +0,0 @@ -from grafanalib.core import * - - -dashboard = Dashboard( - title='Kubernetes Capacity Planning', - version=4, - gnetId=22, - graphTooltip=0, - refresh=False, - editable=False, - schemaVersion=14, - time=Time(start='now-1h'), - timezone='browser', - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus', - } - ], - rows=[ - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Idle CPU', - id=3, - dataSource='prometheus', - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=6, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis(format='percent', label='cpu usage',), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum(rate(node_cpu{mode=\"idle\"}[2m])) ' - '* 100', - 'hide': False, - 'intervalFactor': 10, - 'legendFormat': '', - 'refId': 'A', - 'step': 50, - }, - ], - ), - Graph( - title='System Load', - id=9, - dataSource='prometheus', - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=6, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis(format='percentunit', min=None), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum(node_load1)', - 'intervalFactor': 4, - 'legendFormat': 'load 1m', - 'refId': 'A', - 'step': 20, - 'target': '', - }, - { - 'expr': 'sum(node_load5)', - 'intervalFactor': 4, - 'legendFormat': 'load 5m', - 'refId': 'B', - 'step': 20, - 'target': '' - }, - { - 'expr': 'sum(node_load15)', - 'intervalFactor': 4, - 'legendFormat': 'load 15m', - 'refId': 'C', - 'step': 20, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - 
title='Memory Usage', - id=4, - dataSource='prometheus', - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=9, - stack=True, - seriesOverrides=[ - { - 'alias': 'node_memory_SwapFree{instance=' - '\"172.17.0.1:9100\",job=\"prometheus\"}', - 'yaxis': 2, - } - ], - tooltip=Tooltip( - msResolution=False, valueType='individual' - ), - yAxes=YAxes( - YAxis(format='bytes', min='0'), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum(node_memory_MemTotal) - sum(node_' - 'memory_MemFree) - sum(node_memory_Buffers) - ' - 'sum(node_memory_Cached)', - 'intervalFactor': 2, - 'legendFormat': 'memory usage', - 'metric': 'memo', - 'refId': 'A', - 'step': 10, - 'target': '', - }, - { - 'expr': 'sum(node_memory_Buffers)', - 'interval': '', - 'intervalFactor': 2, - 'legendFormat': 'memory buffers', - 'metric': 'memo', - 'refId': 'B', - 'step': 10, - 'target': '', - }, - { - 'expr': 'sum(node_memory_Cached)', - 'interval': '', - 'intervalFactor': 2, - 'legendFormat': 'memory cached', - 'metric': 'memo', - 'refId': 'C', - 'step': 10, - 'target': '', - }, - { - 'expr': 'sum(node_memory_MemFree)', - 'interval': '', - 'intervalFactor': 2, - 'legendFormat': 'memory free', - 'metric': 'memo', - 'refId': 'D', - 'step': 10, - 'target': '', - }, - ], - ), - SingleStat( - title='Memory Usage', - dataSource='prometheus', - id=5, - format='percent', - span=3, - gauge=Gauge(show=True), - editable=False, - thresholds='80, 90', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - targets=[ - { - 'expr': '((sum(node_memory_MemTotal) - ' - 'sum(node_memory_MemFree) - sum(' - 'node_memory_Buffers) - sum(node_memory_Cached)) ' - '/ sum(node_memory_MemTotal)) * 100', - 'intervalFactor': 2, - 'metric': '', - 'refId': 'A', - 'step': 60, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=246, title='New Row', showTitle=False, 
editable=False, - titleSize='h6', panels=[ - Graph( - title='Disk I/O', - dataSource='prometheus', - id=6, - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=9, - tooltip=Tooltip(msResolution=False), - seriesOverrides=[ - { - 'alias': 'read', - 'yaxis': 1 - }, - { - 'alias': '{instance=\"172.17.0.1:9100\"}', - 'yaxis': 2, - }, - { - 'alias': 'io time', - 'yaxis': 2, - }, - ], - yAxes=YAxes( - YAxis(format='bytes', min=None), - YAxis(format='ms', min=None), - ), - targets=[ - { - 'expr': 'sum(rate(node_disk_bytes_read[5m]))', - 'hide': False, - 'intervalFactor': 4, - 'legendFormat': 'read', - 'refId': 'A', - 'step': 20, - 'target': '' - }, - { - 'expr': 'sum(rate(node_disk_bytes_written[5m]))', - 'intervalFactor': 4, - 'legendFormat': 'written', - 'refId': 'B', - 'step': 20 - }, - { - 'expr': 'sum(rate(node_disk_io_time_ms[5m]))', - 'intervalFactor': 4, - 'legendFormat': 'io time', - 'refId': 'C', - 'step': 20 - }, - ], - ), - SingleStat( - title='Disk Space Usage', - dataSource='prometheus', - id=12, - span=3, - editable=False, - format='percentunit', - valueName='current', - gauge=Gauge( - maxValue=1, - show=True, - ), - thresholds='0.75, 0.9', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - targets=[ - { - 'expr': '(sum(node_filesystem_size{device!=' - '\"rootfs\"}) - sum(node_filesystem_free{' - 'device!=\"rootfs\"})) / sum(node_filesystem_size' - '{device!=\"rootfs\"})', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 60, - 'target': '', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - ), - ] - ), - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Network Received', - dataSource='prometheus', - id=8, - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=6, - tooltip=Tooltip(msResolution=False), - seriesOverrides=[ - { - 'alias': 'transmitted', - 
'yaxis': 2, - }, - ], - yAxes=YAxes( - YAxis(format='bytes', min=None), - YAxis(format='bytes', min=None), - ), - targets=[ - { - 'expr': 'sum(rate(node_network_receive_bytes' - '{device!~\"lo\"}[5m]))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 10, - 'target': '', - }, - ], - ), - Graph( - title='Network Transmitted', - dataSource='prometheus', - id=10, - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=6, - tooltip=Tooltip(msResolution=False), - seriesOverrides=[ - { - 'alias': 'transmitted', - 'yaxis': 2, - }, - ], - yAxes=YAxes( - YAxis(format='bytes', min=None), - YAxis(format='bytes', min=None), - ), - targets=[ - { - 'expr': 'sum(rate(node_network_transmit_bytes' - '{device!~\"lo\"}[5m]))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'B', - 'step': 10, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=276, title='New Row', showTitle=False, editable=False, - titleSize='h6', - panels=[ - Graph( - title='Cluster Pod Utilization', - dataSource='prometheus', - id=11, - span=9, - dashes=False, - editable=False, - spaceLength=11, - tooltip=Tooltip( - msResolution=False, - valueType='individual', - ), - yAxes=YAxes( - YAxis(format='short', min=None), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum(kube_pod_info)', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': 'Current number of Pods', - 'refId': 'A', - 'step': 10, - }, - { - 'expr': 'sum(kube_node_status_capacity_pods)', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': 'Maximum capacity of pods', - 'refId': 'B', - 'step': 10, - } - ], - ), - SingleStat( - title='Pod Utilization', - dataSource='prometheus', - id=7, - editable=False, - span=3, - format='percent', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - gauge=Gauge( - show=True, - ), - thresholds='80, 90', - valueName='current', - targets=[ - { - 
'expr': '100 - (sum(kube_node_status_capacity_' - 'pods) - sum(kube_pod_info)) / sum(kube_node_' - 'status_capacity_pods) * 100', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 60, - 'target': '', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - ), - ] - ), - ], -) diff --git a/assets/grafana/kubernetes-cluster-health.dashboard.py b/assets/grafana/kubernetes-cluster-health.dashboard.py deleted file mode 100644 index 7f1cfe64..00000000 --- a/assets/grafana/kubernetes-cluster-health.dashboard.py +++ /dev/null @@ -1,405 +0,0 @@ -from grafanalib.core import * - - -dashboard = Dashboard( - title='Kubernetes Cluster Health', - version=9, - graphTooltip=0, - schemaVersion=14, - editable=False, - time=Time(start='now-6h'), - timezone='browser', - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], - rows=[ - Row( - height=254, title='Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - SingleStat( - title='Control Plane Components Down', - id=1, - dataSource='prometheus', - gauge=Gauge(), - span=3, - thresholds='1, 3', - colorValue=True, - editable=False, - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'Everything UP and healthy', - 'value': 'null', - }, - { - 'op': '=', - 'text': '', - 'value': '', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'sum(up{job=~"apiserver|kube-scheduler|' - 'kube-controller-manager"} == 0)', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Alerts Firing', - id=2, - dataSource='prometheus', - gauge=Gauge(), - colorValue=True, - editable=False, - 
span=3, - valueName='current', - thresholds='1, 3', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': '0', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'sum(ALERTS{alertstate="firing",' - 'alertname!="DeadMansSwitch"})', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Alerts Pending', - id=3, - dataSource='prometheus', - gauge=Gauge(), - colorValue=True, - editable=False, - span=3, - valueName='current', - thresholds='3, 5', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': '0', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'sum(ALERTS{alertstate="pending",' - 'alertname!="DeadMansSwitch"})', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Crashlooping Pods', - id=4, - dataSource='prometheus', - gauge=Gauge(), - colorValue=True, - editable=False, - span=3, - valueName='current', - thresholds='1, 3', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': '0', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'count(increase(kube_pod_container_' - 'status_restarts[1h]) > 5)', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ], - ), - ], - ), - Row( - height=250, title='Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - 
SingleStat( - title='Node Not Ready', - id=5, - dataSource='prometheus', - gauge=Gauge(), - colorValue=True, - editable=False, - span=3, - valueName='current', - thresholds='1, 3', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'sum(kube_node_status_condition{' - 'condition="Ready",status!="true"})', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Node Disk Pressure', - id=6, - dataSource='prometheus', - gauge=Gauge(), - colorValue=True, - editable=False, - span=3, - valueName='current', - thresholds='1, 3', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'sum(kube_node_status_condition' - '{condition="DiskPressure",status="true"})', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Node Memory Pressure', - id=7, - dataSource='prometheus', - gauge=Gauge(), - colorValue=True, - editable=False, - span=3, - valueName='current', - thresholds='1, 3', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'sum(kube_node_status_condition' - '{condition="MemoryPressure",status="true"})', - 'format': 'time_series', - 'intervalFactor': 2, - 
'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Nodes Unschedulable', - id=8, - dataSource='prometheus', - gauge=Gauge(), - colorValue=True, - editable=False, - span=3, - valueName='current', - thresholds='1, 3', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'sum(kube_node_spec_unschedulable)', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/kubernetes-cluster-status.dashboard.py b/assets/grafana/kubernetes-cluster-status.dashboard.py deleted file mode 100644 index 7288c522..00000000 --- a/assets/grafana/kubernetes-cluster-status.dashboard.py +++ /dev/null @@ -1,450 +0,0 @@ -import sys -import os.path -sys.path.insert(0, os.path.dirname(__file__)) -from _grafanalib import * - - -dashboard = Dashboard( - title='Kubernetes Cluster Status', - version=3, - time=Time(start='now-6h'), - rows=[ - Row( - height=129, title='Cluster Health', showTitle=True, - panels=[ - SingleStat( - title='Control Plane UP', - id=5, - gauge=Gauge(show=False), - colorValue=True, - mappingType=1, - thresholds='1, 3', - valueName='total', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'UP', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'sum(up{job=~"apiserver|kube-scheduler|' - 'kube-controller-manager"} == 0)', - 'format': 'time_series', - }, - ] - ), - SingleStat( - title='Alerts Firing', - id=6, - gauge=Gauge(show=False), - colorValue=True, - mappingType=1, - thresholds='3, 5', - 
valueName='current', - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': '0', - 'value': 'null', - }, - ], - targets=[ - { - 'expr': 'sum(ALERTS{alertstate="firing",' - 'alertname!="DeadMansSwitch"})', - 'format': 'time_series', - }, - ] - ), - ], - ), - Row( - height=168, title='Control Plane Status', showTitle=True, - panels=[ - SingleStat( - title='API Servers UP', - id=1, - mappingType=1, - format='percent', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - thresholds='50, 80', - span=3, - valueName='current', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': '(sum(up{job="apiserver"} == 1) / ' - 'count(up{job="apiserver"})) * 100', - 'format': 'time_series', - }, - ] - ), - SingleStat( - title='Controller Managers UP', - id=2, - span=3, - mappingType=1, - thresholds='50, 80', - format='percent', - valueName='current', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - targets=[ - { - 'expr': '(sum(up{job="kube-controller-manager"} ==' - ' 1) / count(up{job="kube-controller-manager"})) ' - '* 100', - 'format': 'time_series', - }, - ] - ), - SingleStat( - title='Schedulers UP', - id=3, - span=3, - mappingType=1, - format='percent', - thresholds='50, 80', - valueName='current', - 
rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - targets=[ - { - 'expr': '(sum(up{job="kube-scheduler"} == 1) / ' - 'count(up{job="kube-scheduler"})) * 100', - 'format': 'time_series', - }, - ] - ), - SingleStat( - title='Crashlooping Control Plane Pods', - id=4, - colorValue=True, - gauge=Gauge(show=False), - span=3, - mappingType=1, - thresholds='1, 3', - valueName='current', - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': '0', - 'value': 'null', - }, - ], - targets=[ - { - 'expr': 'count(increase(kube_pod_container_' - 'status_restarts{namespace=~"kube-system|' - 'tectonic-system"}[1h]) > 5)', - 'format': 'time_series', - }, - ] - ), - ], - ), - Row( - height=158, title='Capacity Planning', showTitle=True, - panels=[ - SingleStat( - title='CPU Utilization', - id=8, - format='percent', - mappingType=1, - span=3, - thresholds='80, 90', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'sum(100 - (avg by (instance) (rate(' - 'node_cpu{job="node-exporter",mode="idle"}[5m])) ' - '* 100)) / count(node_cpu{job="node-exporter",' - 'mode="idle"})', - 'format': 'time_series', - }, - ] - ), - SingleStat( - title='Memory Utilization', - id=7, - format='percent', - span=3, - mappingType=1, - thresholds='80, 90', - 
mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - targets=[ - { - 'expr': '((sum(node_memory_MemTotal) - sum(' - 'node_memory_MemFree) - sum(node_memory_Buffers) ' - '- sum(node_memory_Cached)) / sum(' - 'node_memory_MemTotal)) * 100', - 'format': 'time_series', - }, - ] - ), - SingleStat( - title='Filesystem Utilization', - id=9, - span=3, - format='percent', - mappingType=1, - thresholds='80, 90', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': '(sum(node_filesystem_size{device!=' - '"rootfs"}) - sum(node_filesystem_free{device!=' - '"rootfs"})) / sum(node_filesystem_size{device!=' - '"rootfs"})', - 'format': 'time_series', - }, - ] - ), - SingleStat( - title='Pod Utilization', - id=10, - gauge=Gauge(show=True), - span=3, - mappingType=1, - format='percent', - thresholds='80, 90', - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - targets=[ - { - 'expr': '100 - (sum(kube_node_status_capacity_pods' - ') - sum(kube_pod_info)) / sum(kube_node_status_' - 'capacity_pods) * 100', - 'format': 'time_series', - }, - ] - ), - ], - ), - ], -) diff --git a/assets/grafana/kubernetes-control-plane-status.dashboard.py b/assets/grafana/kubernetes-control-plane-status.dashboard.py deleted file mode 100644 index d2f35129..00000000 --- 
a/assets/grafana/kubernetes-control-plane-status.dashboard.py +++ /dev/null @@ -1,344 +0,0 @@ -from grafanalib.core import * - -dashboard = Dashboard( - title='Kubernetes Control Plane Status', - version=3, - graphTooltip=0, - schemaVersion=14, - time=Time(start='now-6h'), - timezone='browser', - refresh=None, - editable=False, - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], - rows=[ - Row( - title='Dashboard Row', showTitle=False, titleSize='h6', editable=False, - panels=[ - SingleStat( - title='API Servers UP', - dataSource='prometheus', - format='percent', - editable=False, - gauge=Gauge( - show=True, - ), - id=1, - span=3, - thresholds='50, 80', - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': '(sum(up{job=\"apiserver\"} == 1) / ' - 'sum(up{job=\"apiserver\"})) * 100', - 'format': 'time_series', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - }, - ] - ), - SingleStat( - title='Controller Managers UP', - dataSource='prometheus', - format='percent', - editable=False, - gauge=Gauge( - show=True, - ), - id=2, - span=3, - thresholds='50, 80', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - rangeMaps=([ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ]), - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': '(sum(up{job=\"kube-controller-manager\"}' - ' == 1) / 
sum(up{job=\"kube-controller-manager\"' - '})) * 100', - 'format': 'time_series', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - } - ] - ), - SingleStat( - title='Schedulers UP', - dataSource='prometheus', - format='percent', - editable=False, - gauge=Gauge( - show=True, - ), - id=3, - span=3, - thresholds='50, 80', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - rangeMaps=([ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ]), - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': '(sum(up{job=\"kube-scheduler\"} == 1) ' - '/ sum(up{job=\"kube-scheduler\"})) * 100', - 'format': 'time_series', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - } - ] - ), - SingleStat( - title='API Server Request Error Rate', - dataSource='prometheus', - format='percent', - editable=False, - gauge=Gauge( - show=True, - ), - id=4, - span=3, - thresholds='5, 10', - valueMaps=[ - { - 'op': '=', - 'text': '0', - 'value': 'null', - } - ], - rangeMaps=([ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ]), - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'max(sum by(instance) (rate(' - 'apiserver_request_count{code=~"5.."}[5m])) / ' - 'sum by(instance) (rate(apiserver_request_count' - '[5m]))) * 100', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ] - ), - ], - ), - Row( - title='Dashboard Row', showTitle=False, titleSize='h6', editable=False, - panels=[ - Graph( - title='API Server Request Latency', - id=7, - dataSource='prometheus', - dashLength=10, - dashes=False, - isNew=False, - editable=False, - lineWidth=1, - nullPointMode='null', - tooltip=Tooltip( - msResolution=False, 
valueType='individual', - ), - spaceLength=10, - yAxes=YAxes( - YAxis(format='short', min=None), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum by(verb) (rate(apiserver_latency_' - 'seconds:quantile[5m]) >= 0)', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 30, - } - ], - ), - ], - ), - Row( - title='Dashboard Row', showTitle=False, titleSize='h6', editable=False, - panels=[ - Graph( - title='End to End Scheduling Latency', - id=5, - dataSource='prometheus', - isNew=False, - editable=False, - dashLength=10, - lineWidth=1, - nullPointMode="null", - spaceLength=10, - span=6, - dashes=False, - tooltip=Tooltip( - msResolution=False, - valueType='individual', - ), - yAxes=YAxes( - YAxis(format='short', min=None), - YAxis(format='dtdurations', min=None), - ), - targets=[ - { - 'expr': 'cluster:scheduler_e2e_scheduling_' - 'latency_seconds:quantile', - 'format': 'time_series', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 60, - } - ], - ), - Graph( - title='API Server Request Rates', - id=6, - dataSource='prometheus', - isNew=False, - editable=False, - dashLength=10, - lineWidth=1, - nullPointMode="null", - spaceLength=10, - span=6, - dashes=False, - tooltip=Tooltip( - msResolution=False, - valueType='individual', - ), - yAxes=YAxes( - YAxis(format='short', min=None), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum by(instance) (rate(apiserver_' - 'request_count{code!~\"2..\"}[5m]))', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': 'Error Rate', - 'refId': 'A', - 'step': 60, - }, - { - 'expr': 'sum by(instance) (rate(apiserver_' - 'request_count[5m]))', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': 'Request Rate', - 'refId': 'B', - 'step': 60, - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/kubernetes-resource-requests.dashboard.py b/assets/grafana/kubernetes-resource-requests.dashboard.py deleted file mode 100644 
index 5d5b3bd8..00000000 --- a/assets/grafana/kubernetes-resource-requests.dashboard.py +++ /dev/null @@ -1,205 +0,0 @@ -from grafanalib.core import * - - -dashboard = Dashboard( - title='Kubernetes Resource Requests', - version=2, - graphTooltip=0, - refresh=False, - editable=False, - schemaVersion=14, - time=Time(start='now-3h'), - timezone='browser', - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], - rows=[ - Row( - height=300, title='CPU Cores', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='CPU Cores', - description='This represents the total [CPU resource ' - 'requests](https://kubernetes.io/docs/concepts/configu' - 'ration/manage-compute-resources-container/#meaning-of-' - 'cpu) in the cluster.\nFor comparison the total ' - '[allocatable CPU cores](https://github.com/kubernetes/' - 'community/blob/master/contributors/design-proposals/' - 'node-allocatable.md) is also shown.', - id=1, - dataSource='prometheus', - dashLength=10, - dashes=False, - isNew=False, - editable=False, - lineWidth=1, - spaceLength=10, - nullPointMode='null', - span=9, - tooltip=Tooltip( - msResolution=False, valueType='individual' - ), - yAxes=YAxes( - YAxis(format='short', label='CPU Cores', min=None,), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'min(sum(kube_node_status_allocatable_' - 'cpu_cores) by (instance))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': 'Allocatable CPU Cores', - 'refId': 'A', - 'step': 20, - }, - { - 'expr': 'max(sum(kube_pod_container_resource_' - 'requests_cpu_cores) by (instance))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': 'Requested CPU Cores', - 'refId': 'B', - 'step': 20, - }, - ], - ), - SingleStat( - title='CPU Cores', - dataSource='prometheus', - id=2, - format='percent', - editable=False, - span=3, - gauge=Gauge(show=True), - 
sparkline=SparkLine(show=True), - valueFontSize='110%', - thresholds='80, 90', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - targets=[ - { - 'expr': 'max(sum(kube_pod_container_resource_' - 'requests_cpu_cores) by (instance)) / min(sum' - '(kube_node_status_allocatable_cpu_cores) by ' - '(instance)) * 100', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 240, - }, - ], - ), - ], - ), - Row( - height=300, title='Memory', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Memory', - id=3, - dataSource='prometheus', - description='This represents the total [memory resource ' - 'requests](https://kubernetes.io/docs/concepts/' - 'configuration/manage-compute-resources-container/' - '#meaning-of-memory) in the cluster.\nFor comparison ' - 'the total [allocatable memory](https://github.com/' - 'kubernetes/community/blob/master/contributors/' - 'design-proposals/node-allocatable.md) is also shown.', - dashLength=10, - dashes=False, - lineWidth=1, - isNew=False, - editable=False, - spaceLength=10, - span=9, - nullPointMode='null', - tooltip=Tooltip( - msResolution=False, valueType='individual' - ), - yAxes=YAxes( - YAxis(format='bytes', label='Memory', min=None), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'min(sum(kube_node_status_allocatable_' - 'memory_bytes) by (instance))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': 'Allocatable Memory', - 'refId': 'A', - 'step': 20, - }, - { - 'expr': 'max(sum(kube_pod_container_resource_' - 'requests_memory_bytes) by (instance))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': 'Requested Memory', - 'refId': 'B', - 'step': 20, - }, - ], - ), - SingleStat( - title='Memory', - dataSource='prometheus', - id=4, - format='percent', - span=3, - gauge=Gauge(show=True), - sparkline=SparkLine(show=True), - editable=False, - 
valueFontSize='110%', - thresholds='80, 90', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - targets=[ - { - 'expr': 'max(sum(kube_pod_container_resource_' - 'requests_memory_bytes) by (instance)) / ' - 'min(sum(kube_node_status_allocatable_memory_' - 'bytes) by (instance)) * 100', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 240, - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/nodes.dashboard.py b/assets/grafana/nodes.dashboard.py deleted file mode 100644 index da7b7d24..00000000 --- a/assets/grafana/nodes.dashboard.py +++ /dev/null @@ -1,423 +0,0 @@ -from grafanalib.core import * - - -dashboard = Dashboard( - title='Nodes', - version=2, - description='Dashboard to get an overview of one server', - gnetId=22, - graphTooltip=0, - refresh=False, - editable=False, - schemaVersion=14, - time=Time(start='now-1h'), - timezone='browser', - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], - templating=Templating(list=[ - { - 'allValue': None, - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': None, - 'multi': False, - 'name': 'server', - 'options': [], - 'query': 'label_values(node_boot_time, instance)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - ]), - rows=[ - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Idle CPU', - dataSource='prometheus', - id=3, - isNew=False, - editable=False, - spaceLength=10, - span=6, - dashLength=10, - dashes=False, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis( - format='percent', - label='cpu usage', - max=100, - ), - YAxis(format='short', 
min=None), - ), - targets=[ - { - 'expr': '100 - (avg by (cpu) (irate(node_cpu' - '{mode=\"idle\", instance=\"$server\"}[5m])) ' - '* 100)', - 'hide': False, - 'intervalFactor': 10, - 'legendFormat': '{{cpu}}', - 'refId': 'A', - 'step': 50, - } - ], - ), - Graph( - title='System Load', - dataSource='prometheus', - id=9, - isNew=False, - editable=False, - spaceLength=10, - span=6, - dashLength=10, - dashes=False, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis(format='percentunit', min=None,), - YAxis(format='short', min=None,), - ), - targets=[ - { - 'expr': 'node_load1{instance=\"$server\"}', - 'intervalFactor': 4, - 'legendFormat': 'load 1m', - 'refId': 'A', - 'step': 20, - 'target': '', - }, - { - 'expr': 'node_load5{instance=\"$server\"}', - 'intervalFactor': 4, - 'legendFormat': 'load 5m', - 'refId': 'B', - 'step': 20, - 'target': '', - }, - { - 'expr': 'node_load15{instance=\"$server\"}', - 'intervalFactor': 4, - 'legendFormat': 'load 15m', - 'refId': 'C', - 'step': 20, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Memory Usage', - dataSource='prometheus', - id=4, - isNew=False, - editable=False, - spaceLength=10, - span=9, - stack=True, - dashLength=10, - dashes=False, - tooltip=Tooltip( - msResolution=False, valueType='individual', - ), - seriesOverrides=[ - { - 'alias': 'node_memory_SwapFree{instance=' - '\"172.17.0.1:9100\",job=\"prometheus\"}', - 'yaxis': 2, - }, - ], - yAxes=YAxes( - YAxis(format='bytes', min='0',), - YAxis(format='short', min=None,), - ), - targets=[ - { - 'expr': 'node_memory_MemTotal{instance=' - '\"$server\"} - node_memory_MemFree{instance=' - '\"$server\"} - node_memory_Buffers{instance=' - '\"$server\"} - node_memory_Cached{instance=' - '\"$server\"}', - 'hide': False, - 'interval': '', - 'intervalFactor': 2, - 'legendFormat': 'memory used', - 'metric': '', - 'refId': 'C', - 'step': 10, - }, - { - 'expr': 
'node_memory_Buffers{instance=' - '\"$server\"}', - 'interval': '', - 'intervalFactor': 2, - 'legendFormat': 'memory buffers', - 'metric': '', - 'refId': 'E', - 'step': 10, - }, - { - 'expr': 'node_memory_Cached{instance=\"$server\"}', - 'intervalFactor': 2, - 'legendFormat': 'memory cached', - 'metric': '', - 'refId': 'F', - 'step': 10, - }, - { - 'expr': 'node_memory_MemFree{instance=' - '\"$server\"}', - 'intervalFactor': 2, - 'legendFormat': 'memory free', - 'metric': '', - 'refId': 'D', - 'step': 10, - }, - ], - ), - SingleStat( - title='Memory Usage', - dataSource='prometheus', - id=5, - format='percent', - gauge=Gauge(show=True), - editable=False, - span=3, - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - } - ], - thresholds='80, 90', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - targets=[ - { - 'expr': '((node_memory_MemTotal{instance=' - '\"$server\"} - node_memory_MemFree{instance=' - '\"$server\"} - node_memory_Buffers{instance=' - '\"$server\"} - node_memory_Cached{instance=' - '\"$server\"}) / node_memory_MemTotal{instance=' - '\"$server\"}) * 100', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 60, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=250, titleSize='h6', title='New Row', editable=False, - showTitle=False, panels=[ - Graph( - title='Disk I/O', - dataSource='prometheus', - id=6, - dashLength=10, - dashes=False, - editable=False, - spaceLength=10, - span=9, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis( - format='bytes', - min=None, - ), - YAxis( - format='ms', - min=None, - ), - ), - seriesOverrides=[ - { - 'alias': 'read', - 'yaxis': 1, - }, - { - 'alias': '{instance=\"172.17.0.1:9100\"}', - 'yaxis': 2, - }, - { - 'alias': 'io time', - 'yaxis': 2, - }, - ], - targets=[ - { - 'expr': 'sum by (instance) (rate(node_disk_' - 'bytes_read{instance=\"$server\"}[2m]))', - 'hide': False, - 'intervalFactor': 4, - 'legendFormat': 'read', - 'refId': 'A', - 'step': 20, - 
'target': '', - }, - { - 'expr': 'sum by (instance) (rate(node_disk_' - 'bytes_written{instance=\"$server\"}[2m]))', - 'intervalFactor': 4, - 'legendFormat': 'written', - 'refId': 'B', - 'step': 20 - }, - { - 'expr': 'sum by (instance) (rate(node_disk_io_' - 'time_ms{instance=\"$server\"}[2m]))', - 'intervalFactor': 4, - 'legendFormat': 'io time', - 'refId': 'C', - 'step': 20, - }, - ], - ), - SingleStat( - title='Disk Space Usage', - dataSource='prometheus', - id=7, - thresholds='0.75, 0.9', - editable=False, - valueName='current', - format='percentunit', - span=3, - gauge=Gauge( - maxValue=1, - show=True, - ), - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - targets=[ - { - 'expr': '(sum(node_filesystem_size{device!=' - '\"rootfs\",instance=\"$server\"}) - ' - 'sum(node_filesystem_free{device!=\"rootfs\",' - 'instance=\"$server\"})) / sum(node_filesystem_' - 'size{device!=\"rootfs\",instance=\"$server\"})', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 60, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=250, title='New Row', titleSize='h6', - showTitle=False, editable=False, - panels=[ - Graph( - title='Network Received', - dataSource='prometheus', - id=8, - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=6, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis(format='bytes', min=None), - YAxis(format='bytes', min=None), - ), - seriesOverrides=[ - { - 'alias': 'transmitted', - 'yaxis': 2, - }, - ], - targets=[ - { - 'expr': 'rate(node_network_receive_bytes{' - 'instance=\"$server\",device!~\"lo\"}[5m])', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': '{{device}}', - 'refId': 'A', - 'step': 10, - 'target': '' - } - ], - ), - Graph( - title='Network Transmitted', - dataSource='prometheus', - id=10, - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - 
span=6, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis(format='bytes', min=None), - YAxis(format='bytes', min=None), - ), - seriesOverrides=[ - { - 'alias': 'transmitted', - 'yaxis': 2, - }, - ], - targets=[ - { - 'expr': 'rate(node_network_transmit_bytes' - '{instance=\"$server\",device!~\"lo\"}[5m])', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': '{{device}}', - 'refId': 'B', - 'step': 10, - 'target': '', - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/pods.dashboard.py b/assets/grafana/pods.dashboard.py deleted file mode 100644 index 84b3fdef..00000000 --- a/assets/grafana/pods.dashboard.py +++ /dev/null @@ -1,255 +0,0 @@ -from grafanalib.core import * - - -dashboard = Dashboard( - title='Pods', - version=1, - graphTooltip=1, - refresh=False, - editable=False, - schemaVersion=14, - time=Time(start='now-6h'), - timezone='browser', - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], - templating=Templating(list=[ - { - 'allValue': '.*', - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': True, - 'label': 'Namespace', - 'multi': False, - 'name': 'namespace', - 'options': [], - 'query': 'label_values(kube_pod_info, namespace)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - { - 'allValue': None, - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': 'Pod', - 'multi': False, - 'name': 'pod', - 'options': [], - 'query': 'label_values(kube_pod_info{namespace=~"$namespace"}, ' - 'pod)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - { - 'allValue': '.*', - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': True, - 'label': 
'Container', - 'multi': False, - 'name': 'container', - 'options': [], - 'query': 'label_values(kube_pod_container_info{namespace=' - '"$namespace", pod="$pod"}, container)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - ]), - rows=[ - Row( - height=250, title='Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Memory Usage', - dataSource='prometheus', - id=1, - isNew=False, - editable=False, - spaceLength=10, - span=12, - dashLength=10, - dashes=False, - tooltip=Tooltip(msResolution=True, valueType='cumulative'), - legend=Legend( - alignAsTable=True, avg=True, current=True, - rightSide=True, total=False, values=True, - ), - yAxes=YAxes( - YAxis( - format='bytes', min=None, - ), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum by(container_name) (container_' - 'memory_usage_bytes{pod_name="$pod", ' - 'container_name=~"$container", ' - 'container_name!="POD"})', - 'interval': '10s', - 'intervalFactor': 1, - 'legendFormat': 'Current: {{ container_name }}', - 'metric': 'container_memory_usage_bytes', - 'refId': 'A', - 'step': 15, - }, - { - 'expr': 'kube_pod_container_resource_requests_' - 'memory_bytes{pod="$pod", container=~' - '"$container"}', - 'interval': '10s', - 'intervalFactor': 2, - 'legendFormat': 'Requested: {{ container }}', - 'metric': 'kube_pod_container_resource_' - 'requests_memory_bytes', - 'refId': 'B', - 'step': 20, - }, - { - 'expr': 'kube_pod_container_resource_limits_' - 'memory_bytes{pod="$pod", container=~' - '"$container"}', - 'interval': '10s', - 'intervalFactor': 2, - 'legendFormat': 'Limit: {{ container }}', - 'metric': 'kube_pod_container_resource_' - 'limits_memory_bytes', - 'refId': 'C', - 'step': 20, - }, - ], - ), - ], - ), - Row( - height=250, title='Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='CPU Usage', - dataSource='prometheus', - id=2, - 
isNew=False, - editable=False, - spaceLength=10, - span=12, - dashLength=10, - dashes=False, - legend=Legend( - alignAsTable=True, avg=True, current=True, - rightSide=True, total=False, values=True, - ), - tooltip=Tooltip(msResolution=True, valueType='cumulative'), - yAxes=YAxes( - YAxis( - format='short', min=None, - ), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum by (container_name)(' - 'rate(container_cpu_usage_seconds_total' - '{image!="",container_name!="POD",pod_name=' - '"$pod"}[1m]))', - 'intervalFactor': 2, - 'legendFormat': '{{ container_name }}', - 'refId': 'A', - 'step': 30 - }, - { - 'expr': 'kube_pod_container_resource_requests_' - 'cpu_cores{pod="$pod", container=~' - '"$container"}', - 'interval': '10s', - 'intervalFactor': 2, - 'legendFormat': 'Requested: {{ container }}', - 'metric': 'kube_pod_container_resource_' - 'requests_cpu_cores', - 'refId': 'B', - 'step': 20, - }, - { - 'expr': 'kube_pod_container_resource_limits_' - 'cpu_cores{pod="$pod", container=~' - '"$container"}', - 'interval': '10s', - 'intervalFactor': 2, - 'legendFormat': 'Limit: {{ container }}', - 'metric': 'kube_pod_container_resource_' - 'limits_memory_bytes', - 'refId': 'C', - 'step': 20, - }, - ], - ), - ], - ), - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Network I/O', - dataSource='prometheus', - id=3, - isNew=False, - editable=False, - spaceLength=10, - span=12, - dashLength=10, - dashes=False, - legend=Legend( - alignAsTable=True, avg=True, current=True, - rightSide=True, total=False, values=True, - ), - tooltip=Tooltip(msResolution=True, valueType='cumulative'), - yAxes=YAxes( - YAxis( - format='bytes', min=None, - ), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sort_desc(sum by (pod_name) (rate' - '(container_network_receive_bytes_total{' - 'pod_name="$pod"}[1m])))', - 'intervalFactor': 2, - 'legendFormat': '{{ pod_name }}', - 'refId': 'A', - 'step': 
30 - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/prometheus-datasource.json b/assets/grafana/prometheus-datasource.json deleted file mode 100644 index 47b8f1b2..00000000 --- a/assets/grafana/prometheus-datasource.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "access": "proxy", - "basicAuth": false, - "name": "prometheus", - "type": "prometheus", - "url": "http://prometheus-k8s.monitoring.svc:9090" -} diff --git a/assets/grafana/raw-json-dashboards/etcd-dashboard.json b/assets/grafana/raw-json-dashboards/etcd-dashboard.json deleted file mode 100644 index f2a03cec..00000000 --- a/assets/grafana/raw-json-dashboards/etcd-dashboard.json +++ /dev/null @@ -1,1158 +0,0 @@ -{ - "__inputs": [ - { - "name": "prometheus", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.5.2" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "etcd sample Grafana dashboard with Prometheus", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 28, - "interval": null, - "links": 
[], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(etcd_server_has_leader)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "metric": "etcd_server_has_leader", - "refId": "A", - "step": 20 - } - ], - "thresholds": "", - "title": "Up", - "type": "singlestat", - "valueFontSize": "200%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 23, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 5, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Rate", - "metric": "grpc_server_started_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Failed Rate", - "metric": 
"grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 41, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Watch Streams", - "metric": "grpc_server_handled_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Lease Streams", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": 
null, - "title": "Active Streams", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": null, - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "etcd_debugging_mvcc_db_total_size_in_bytes", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} DB Size", - "metric": "", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "DB Size", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": 
"short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 1, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}} WAL fsync", - "metric": "etcd_disk_wal_fsync_duration_seconds_bucket", - "refId": "A", - "step": 4 - }, - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} DB fsync", - "metric": "etcd_disk_backend_commit_duration_seconds_bucket", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk Sync Duration", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": 
false, - "fill": 0, - "id": 29, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Resident Memory", - "metric": "process_resident_memory_bytes", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 5, - "id": 22, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": true, - 
"steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Client Traffic In", - "metric": "etcd_network_client_grpc_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 5, - "id": 21, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Client Traffic Out", - "metric": "etcd_network_client_grpc_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - 
"name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Peer Traffic In", - "metric": "etcd_network_peer_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": null, - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 16, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": 
false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} Peer Traffic Out", - "metric": "etcd_network_peer_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 40, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"sum(rate(etcd_server_proposals_failed_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Failure Rate", - "metric": "etcd_server_proposals_failed_total", - "refId": "A", - "step": 2 - }, - { - "expr": "sum(etcd_server_proposals_pending)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Pending Total", - "metric": "etcd_server_proposals_pending", - "refId": "B", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_committed_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Commit Rate", - "metric": "etcd_server_proposals_committed_total", - "refId": "C", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_applied_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Apply Rate", - "refId": "D", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Raft Proposals", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": 0, - "editable": false, - "error": false, - "fill": 0, - "id": 19, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - 
"seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "changes(etcd_server_leader_changes_seen_total[1d])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Total Leader Elections Per Day", - "metric": "etcd_server_leader_changes_seen_total", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Total Leader Elections Per Day", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": { - "now": true, - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "etcd", - "version": 4 -} \ No newline at end of file diff --git a/assets/grafana/statefulset.dashboard.py b/assets/grafana/statefulset.dashboard.py deleted file mode 100644 index 780630a2..00000000 --- a/assets/grafana/statefulset.dashboard.py +++ /dev/null @@ -1,440 +0,0 @@ -import sys -import os.path -sys.path.insert(0, os.path.dirname(__file__)) -from _grafanalib import * - - -dashboard = Dashboard( - title='StatefulSet', - version=1, - graphTooltip=1, - 
time=Time(start='now-6h'), - templating=Templating(list=[ - { - 'allValue': '.*', - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': 'Namespace', - 'multi': False, - 'name': 'statefulset_namespace', - 'options': [], - 'query': 'label_values(kube_statefulset_metadata_generation, ' - 'namespace)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': None, - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - { - 'allValue': None, - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': 'StatefulSet', - 'multi': False, - 'name': 'statefulset_name', - 'options': [], - 'query': 'label_values(kube_statefulset_metadata_generation' - '{namespace="$statefulset_namespace"}, statefulset)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': 'statefulset', - 'type': 'query', - 'useTags': False, - }, - ]), - rows=[ - Row(panels=[ - SingleStat( - title='CPU', - id=8, - gauge=Gauge(show=False), - postfix='cores', - span=4, - valueFontSize='110%', - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - targets=[ - { - 'expr': 'sum(rate(container_cpu_usage_seconds_total' - '{namespace=\"$statefulset_namespace\",pod_name=~\"' - '$statefulset_name.*\"}[3m]))', - }, - ], - ), - SingleStat( - title='Memory', - id=9, - postfix='GB', - prefixFontSize='80%', - gauge=Gauge(show=False), - span=4, - valueFontSize='110%', - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 
'name': 'range to text', - 'value': 2, - }, - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': 'sum(container_memory_usage_bytes{namespace=' - '\"$statefulset_namespace\",pod_name=~\"$' - 'statefulset_name.*\"}) / 1024^3', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Network', - format='Bps', - gauge=Gauge(thresholdMarkers=False), - id=7, - postfix='', - span=4, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': 'sum(rate(container_network_transmit_' - 'bytes_total' - '{namespace=\"$statefulset_namespace\",pod_name=~\"' - '$statefulset_name.*\"}[3m])) + ' - 'sum(rate(container_network_receive_bytes_total' - '{namespace=\"$statefulset_namespace\",pod_name=~' - '\"$statefulset_name.*\"}[3m]))', - }, - ], - ), - ], - height=200, - ), - Row( - height=100, panels=[ - SingleStat( - title='Desired Replicas', - id=5, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - span=3, - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'metric': 'kube_statefulset_replicas', - 'expr': 'max(kube_statefulset_replicas' - 
'{statefulset="$statefulset_name",namespace=' - '"$statefulset_namespace"}) without ' - '(instance, pod)', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - gauge=Gauge(thresholdMarkers=False, show=False), - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - ), - SingleStat( - title='Available Replicas', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(show=False), - id=6, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'min(kube_statefulset_status_replicas' - '{statefulset=\"$statefulset_name\",' - 'namespace=\"$statefulset_namespace\"}) without ' - '(instance, pod)', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - ), - SingleStat( - title='Observed Generation', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(), - id=3, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'max(kube_statefulset_status_observed_' - 'generation{statefulset=\"$statefulset_name\",' - 'namespace=\"$statefulset_namespace\"}) without ' - '(instance, pod)', - }, - ], - rangeMaps=[ - { - 'from': "null", - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - ), - SingleStat( - title='Metadata Generation', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(show=False), - id=2, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - 
}, - ], - targets=[ - { - 'expr': 'max(kube_statefulset_metadata_generation' - '{statefulset=\"$statefulset_name\",namespace=\"' - '$statefulset_namespace\"}) without (instance, ' - 'pod)', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - ), - ], - ), - Row( - height=350, panels=[ - Graph( - title='Replicas', - dashLength=10, - dashes=False, - id=1, - spaceLength=10, - targets=[ - { - 'expr': 'min(kube_statefulset_status_replicas' - '{statefulset=\"$statefulset_name\",' - 'namespace=\"$statefulset_namespace\"}) without ' - '(instance, pod)', - 'legendFormat': 'available', - 'refId': 'B', - 'step': 30, - }, - { - 'expr': 'max(kube_statefulset_replicas' - '{statefulset=\"$statefulset_name\",namespace=\"' - '$statefulset_namespace\"}) without ' - '(instance, pod)', - 'legendFormat': 'desired', - 'refId': 'E', - 'step': 30, - } - ], - xAxis=XAxis(mode='time'), - yAxes=YAxes( - YAxis(min=None), - YAxis(format='short', min=None, show=False), - ), - ), - ] - ), - ], -) From 64db049d3a886f39816b2778ed305956c9c424dd Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 28 May 2018 16:54:39 +0200 Subject: [PATCH 271/638] kube-prometheus: Migrate kube-prometheus alerts to jsonnet --- .../prometheus/rules/alertmanager.rules.yaml | 33 ---- assets/prometheus/rules/general.rules.yaml | 39 ----- assets/prometheus/rules/node.rules.yaml | 47 ----- assets/prometheus/rules/prometheus.rules.yaml | 101 ----------- .../alerts/alertmanager.libsonnet | 53 ++++++ .../kube-prometheus/alerts/alerts.libsonnet | 4 + .../kube-prometheus/alerts/general.libsonnet | 34 ++++ jsonnet/kube-prometheus/alerts/node.libsonnet | 39 +++++ .../alerts/prometheus.libsonnet | 151 ++++++++++++++++ .../kube-prometheus/kube-prometheus.libsonnet | 28 ++- jsonnet/kube-prometheus/rules/rules.libsonnet | 39 +++++ manifests/grafana-dashboardDefinitions.yaml | 26 
+-- manifests/prometheus-rules.yaml | 161 +++++++++++++++--- 13 files changed, 497 insertions(+), 258 deletions(-) delete mode 100644 assets/prometheus/rules/alertmanager.rules.yaml delete mode 100644 assets/prometheus/rules/general.rules.yaml delete mode 100644 assets/prometheus/rules/node.rules.yaml delete mode 100644 assets/prometheus/rules/prometheus.rules.yaml create mode 100644 jsonnet/kube-prometheus/alerts/alertmanager.libsonnet create mode 100644 jsonnet/kube-prometheus/alerts/alerts.libsonnet create mode 100644 jsonnet/kube-prometheus/alerts/general.libsonnet create mode 100644 jsonnet/kube-prometheus/alerts/node.libsonnet create mode 100644 jsonnet/kube-prometheus/alerts/prometheus.libsonnet create mode 100644 jsonnet/kube-prometheus/rules/rules.libsonnet diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml deleted file mode 100644 index 5e51f75b..00000000 --- a/assets/prometheus/rules/alertmanager.rules.yaml +++ /dev/null @@ -1,33 +0,0 @@ -groups: -- name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) - GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", - "alertmanager-$1", "alertmanager", "(.*)") != 1 - for: 5m - labels: - severity: critical - annotations: - description: The configuration of the instances of the Alertmanager cluster - `{{$labels.service}}` are out of sync. - summary: Configuration out of sync - - alert: AlertmanagerDownOrMissing - expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", - "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 - for: 5m - labels: - severity: warning - annotations: - description: An unexpected number of Alertmanagers are scraped or Alertmanagers - disappeared from discovery. 
- summary: Alertmanager down or missing - - alert: AlertmanagerFailedReload - expr: alertmanager_config_last_reload_successful == 0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. - summary: Alertmanager's configuration reload failed diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml deleted file mode 100644 index 84ce6b47..00000000 --- a/assets/prometheus/rules/general.rules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -groups: -- name: general.rules - rules: - - alert: TargetDown - expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of {{ $labels.job }} targets are down.' - summary: Targets are down - - alert: DeadMansSwitch - expr: vector(1) - labels: - severity: none - annotations: - description: This is a DeadMansSwitch meant to ensure that the entire Alerting - pipeline is functional. 
- summary: Alerting DeadMansSwitch - - record: fd_utilization - expr: process_open_fds / process_max_fds - - alert: FdExhaustionClose - expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next 4 hours' - summary: file descriptors soon exhausted - - alert: FdExhaustionClose - expr: predict_linear(fd_utilization[10m], 3600) > 1 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next hour' - summary: file descriptors soon exhausted diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml deleted file mode 100644 index e678ca84..00000000 --- a/assets/prometheus/rules/node.rules.yaml +++ /dev/null @@ -1,47 +0,0 @@ -groups: -- name: node.rules - rules: - - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) - BY (instance) - - record: instance:node_filesystem_usage:sum - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) - BY (instance) - - record: instance:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) - - record: instance:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) - - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) - GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) - - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - - record: cluster:node_cpu:ratio - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - - alert: NodeExporterDown - expr: 
absent(up{job="node-exporter"} == 1) - for: 10m - labels: - severity: warning - annotations: - description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery - summary: Prometheus could not scrape a node-exporter - - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 - for: 30m - labels: - severity: warning - annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 24 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 24 hours - - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 - for: 10m - labels: - severity: critical - annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 2 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 2 hours diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml deleted file mode 100644 index da699c32..00000000 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ /dev/null @@ -1,101 +0,0 @@ -groups: -- name: prometheus.rules - rules: - - alert: PrometheusConfigReloadFailed - expr: prometheus_config_last_reload_successful == 0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} - summary: Reloading Promehteus' configuration failed - - - alert: PrometheusNotificationQueueRunningFull - expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity - for: 10m - labels: - severity: warning - annotations: - description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ - $labels.pod}} - summary: Prometheus' alert notification queue is running full - - - alert: 
PrometheusErrorSendingAlerts - expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) - > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ - $labels.pod}} to Alertmanager {{$labels.Alertmanager}} - summary: Errors while sending alert from Prometheus - - - alert: PrometheusErrorSendingAlerts - expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) - > 0.03 - for: 10m - labels: - severity: critical - annotations: - description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ - $labels.pod}} to Alertmanager {{$labels.Alertmanager}} - summary: Errors while sending alerts from Prometheus - - - alert: PrometheusNotConnectedToAlertmanagers - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 10m - labels: - severity: warning - annotations: - description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected - to any Alertmanagers - summary: Prometheus is not connected to any Alertmanagers - - - alert: PrometheusTSDBReloadsFailing - expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 - for: 12h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} - reload failures over the last four hours.' - summary: Prometheus has issues reloading data blocks from disk - - - alert: PrometheusTSDBCompactionsFailing - expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 - for: 12h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} - compaction failures over the last four hours.' 
- summary: Prometheus has issues compacting sample blocks - - - alert: PrometheusTSDBWALCorruptions - expr: tsdb_wal_corruptions_total > 0 - for: 4h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead - log (WAL).' - summary: Prometheus write-ahead log is corrupted - - - alert: PrometheusNotIngestingSamples - expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0 - for: 10m - labels: - severity: warning - annotations: - description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." - summary: "Prometheus isn't ingesting samples" - - - alert: PrometheusTargetScapesDuplicate - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 10m - labels: - severity: warning - annotations: - description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values" - summary: Prometheus has many samples rejected diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet new file mode 100644 index 00000000..d283cc18 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -0,0 +1,53 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'alertmanager.rules', + rules: [ + { + alert: 'AlertmanagerConfigInconsistent', + annotations: { + description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', + summary: 'Configuration out of sync', + }, + expr: ||| + count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'AlertmanagerDownOrMissing', + 
annotations: { + description: 'An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.', + summary: 'Alertmanager down or missing', + }, + expr: ||| + label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{%(alertmanagerSelector)s}) BY (job) != 1 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'AlertmanagerFailedReload', + annotations: { + description: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.", + summary: "Alertmanager's configuration reload failed", + }, + expr: ||| + alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/alerts/alerts.libsonnet new file mode 100644 index 00000000..19568a24 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/alerts.libsonnet @@ -0,0 +1,4 @@ +(import 'alertmanager.libsonnet') + +(import 'general.libsonnet') + +(import 'node.libsonnet') + +(import 'prometheus.libsonnet') diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet new file mode 100644 index 00000000..6f3e4534 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -0,0 +1,34 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'general.rules', + rules: [ + { + alert: 'TargetDown', + annotations: { + description: '{{ $value }}% of {{ $labels.job }} targets are down.', + summary: 'Targets are down', + }, + expr: '100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10', + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'DeadMansSwitch', + annotations: { + description: 'This is a DeadMansSwitch meant to ensure that 
the entire Alerting pipeline is functional.', + summary: 'Alerting DeadMansSwitch', + }, + expr: 'vector(1)', + labels: { + severity: 'none', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet new file mode 100644 index 00000000..f5387a99 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -0,0 +1,39 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'kube-prometheus-node-alerting.rules', + rules: [ + { + alert: 'NodeDiskRunningFull', + annotations: { + description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})', + summary: 'Node disk is running full within 24 hours', + }, + expr: ||| + predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[6h], 3600 * 24) < 0 + ||| % $._config, + 'for': '30m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeDiskRunningFull', + annotations: { + description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})', + summary: 'Node disk is running full within 2 hours', + }, + expr: ||| + predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[30m], 3600 * 2) < 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet new file mode 100644 index 00000000..32d8262b --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet @@ -0,0 +1,151 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'prometheus.rules', + rules: [ + { + alert: 'PrometheusConfigReloadFailed', + annotations: { + description: "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}", + summary: "Reloading Promehteus' configuration failed", + }, + expr: ||| + 
prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusNotificationQueueRunningFull', + annotations: { + description: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}", + summary: "Prometheus' alert notification queue is running full", + }, + expr: ||| + predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > prometheus_notifications_queue_capacity{%(prometheusSelector)s} + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusErrorSendingAlerts', + annotations: { + description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + summary: 'Errors while sending alert from Prometheus', + }, + expr: ||| + rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.01 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusErrorSendingAlerts', + annotations: { + description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + summary: 'Errors while sending alerts from Prometheus', + }, + expr: ||| + rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.03 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'PrometheusNotConnectedToAlertmanagers', + annotations: { + description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers', + summary: 'Prometheus is not connected to any Alertmanagers', + }, + expr: ||| + 
prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBReloadsFailing', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.', + summary: 'Prometheus has issues reloading data blocks from disk', + }, + expr: ||| + increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0 + ||| % $._config, + 'for': '12h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBCompactionsFailing', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.', + summary: 'Prometheus has issues compacting sample blocks', + }, + expr: ||| + increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0 + ||| % $._config, + 'for': '12h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBWALCorruptions', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).', + summary: 'Prometheus write-ahead log is corrupted', + }, + expr: ||| + tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 + ||| % $._config, + 'for': '4h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusNotIngestingSamples', + annotations: { + description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", + summary: "Prometheus isn't ingesting samples", + }, + expr: ||| + rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTargetScapesDuplicate', + annotations: { + description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values', + summary: 
'Prometheus has many samples rejected', + }, + expr: ||| + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index e79b7567..6c1636de 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -6,7 +6,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; (import 'alertmanager/alertmanager.libsonnet') + (import 'prometheus-operator/prometheus-operator.libsonnet') + (import 'prometheus/prometheus.libsonnet') + -(import 'kubernetes-mixin/mixin.libsonnet') + { +(import 'kubernetes-mixin/mixin.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'rules/rules.libsonnet') + { kubePrometheus+:: { namespace: k.core.v1.namespace.new($._config.namespace), }, @@ -14,11 +16,31 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; _config+:: { namespace: 'default', - kubeStateMetricsSelector: 'job="kube-state-metrics"', cadvisorSelector: 'job="kubelet"', - nodeExporterSelector: 'job="node-exporter"', kubeletSelector: 'job="kubelet"', + kubeStateMetricsSelector: 'job="kube-state-metrics"', + nodeExporterSelector: 'job="node-exporter"', notKubeDnsSelector: 'job!="kube-dns"', + kubeSchedulerSelector: 'job="kube-scheduler"', + kubeControllerManagerSelector: 'job="kube-controller-manager"', + kubeApiserverSelector: 'job="apiserver"', + podLabel: 'pod', + + alertmanagerSelector: 'job="alertmanager-main"', + prometheusSelector: 'job="prometheus-k8s"', + prometheusOperatorSelector: 'job="prometheus-operator"', + + jobs: { + Kubelet: $._config.kubeletSelector, + KubeScheduler: $._config.kubeSchedulerSelector, + KubeControllerManager: $._config.kubeControllerManagerSelector, + KubeAPI: $._config.kubeApiserverSelector, + KubeStateMetrics: 
$._config.kubeStateMetricsSelector, + NodeExporter: $._config.nodeExporterSelector, + Alertmanager: $._config.alertmanagerSelector, + Prometheus: $._config.prometheusSelector, + PrometheusOperator: $._config.prometheusOperatorSelector, + }, prometheus+:: { rules: $.prometheusRules + $.prometheusAlerts, diff --git a/jsonnet/kube-prometheus/rules/rules.libsonnet b/jsonnet/kube-prometheus/rules/rules.libsonnet new file mode 100644 index 00000000..ec3a331e --- /dev/null +++ b/jsonnet/kube-prometheus/rules/rules.libsonnet @@ -0,0 +1,39 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'kube-prometheus-node-recording.rules', + rules: [ + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)', + record: 'instance:node_cpu:rate:sum', + }, + { + expr: 'sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)', + record: 'instance:node_filesystem_usage:sum', + }, + { + expr: 'sum(rate(node_network_receive_bytes[3m])) BY (instance)', + record: 'instance:node_network_receive_bytes:rate:sum', + }, + { + expr: 'sum(rate(node_network_transmit_bytes[3m])) BY (instance)', + record: 'instance:node_network_transmit_bytes:rate:sum', + }, + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)', + record: 'instance:node_cpu:ratio', + }, + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))', + record: 'cluster:node_cpu:sum_rate5m', + }, + { + expr: 'cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))', + record: 'cluster:node_cpu:ratio', + }, + ], + }, + ], + }, +} diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index af7e2749..f4058562 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -3868,7 +3868,7 @@ data: "steppedLine": false, "targets": [ { - "expr": 
"sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod_name=\"$pod\"}[1m])) by (container_name)", + "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4097,7 +4097,7 @@ data: ], "targets": [ { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4228,7 +4228,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}) by (container_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4457,7 +4457,7 @@ data: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5003,7 +5003,7 @@ data: "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "percent", "gauge": { "maxValue": 100, @@ -5206,7 +5206,7 @@ data: "rgba(237, 129, 40, 0.89)", "rgba(245, 
54, 54, 0.9)" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "percent", "gauge": { "maxValue": 100, @@ -6066,7 +6066,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6145,7 +6145,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6224,7 +6224,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6317,7 +6317,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6397,7 +6397,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6477,7 +6477,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6557,7 +6557,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 35aaa927..d916ff29 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -49,13 +49,13 @@ data: without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n- \"name\": \"kube-apiserver.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, - sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) without(instance, + sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) 
without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \"expr\": - |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) + |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) + \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n- \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info) @@ -122,20 +122,49 @@ data: by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]) +\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-absent\"\n - \ \"rules\": \n - \"alert\": \"KubeAPIDown\"\n \"annotations\": \n \"message\": - \"KubeAPI has disappeared from Prometheus target discovery.\"\n \"expr\": |\n - \ absent(up{job=\"kube-apiserver\"} == 1)\n \"for\": \"15m\"\n \"labels\": - \n \"severity\": \"critical\"\n - \"alert\": \"KubeControllerManagerDown\"\n - \ \"annotations\": \n \"message\": \"KubeControllerManager has disappeared - from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-controller-manager\"} + \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kube-prometheus-node-recording.rules\"\n + \ \"rules\": \n - 
\"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[3m])) + BY (instance)\"\n \"record\": \"instance:node_cpu:rate:sum\"\n - \"expr\": + \"sum((node_filesystem_size{mountpoint=\\\"/\\\"} - node_filesystem_free{mountpoint=\\\"/\\\"})) + BY (instance)\"\n \"record\": \"instance:node_filesystem_usage:sum\"\n - \"expr\": + \"sum(rate(node_network_receive_bytes[3m])) BY (instance)\"\n \"record\": \"instance:node_network_receive_bytes:rate:sum\"\n + \ - \"expr\": \"sum(rate(node_network_transmit_bytes[3m])) BY (instance)\"\n \"record\": + \"instance:node_network_transmit_bytes:rate:sum\"\n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m])) + WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, + cpu)) BY (instance)\"\n \"record\": \"instance:node_cpu:ratio\"\n - \"expr\": + \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m]))\"\n \"record\": + \"cluster:node_cpu:sum_rate5m\"\n - \"expr\": \"cluster:node_cpu:rate5m / count(sum(node_cpu) + BY (instance, cpu))\"\n \"record\": \"cluster:node_cpu:ratio\"\n- \"name\": + \"kubernetes-absent\"\n \"rules\": \n - \"alert\": \"AlertmanagerDown\"\n \"annotations\": + \n \"message\": \"Alertmanager has disappeared from Prometheus target discovery.\"\n + \ \"expr\": |\n absent(up{job=\"alertmanager-main\"} == 1)\n \"for\": + \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeAPIDown\"\n + \ \"annotations\": \n \"message\": \"KubeAPI has disappeared from Prometheus + target discovery.\"\n \"expr\": |\n absent(up{job=\"apiserver\"} == 1)\n + \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"KubeControllerManagerDown\"\n \"annotations\": \n \"message\": + \"KubeControllerManager has disappeared from Prometheus target discovery.\"\n + \ \"expr\": |\n absent(up{job=\"kube-controller-manager\"} == 1)\n \"for\": + \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": 
\"KubeSchedulerDown\"\n + \ \"annotations\": \n \"message\": \"KubeScheduler has disappeared from + Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-scheduler\"} == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeSchedulerDown\"\n \"annotations\": \n \"message\": - \"KubeScheduler has disappeared from Prometheus target discovery.\"\n \"expr\": - |\n absent(up{job=\"kube-scheduler\"} == 1)\n \"for\": \"15m\"\n \"labels\": + \ - \"alert\": \"KubeStateMetricsDown\"\n \"annotations\": \n \"message\": + \"KubeStateMetrics has disappeared from Prometheus target discovery.\"\n \"expr\": + |\n absent(up{job=\"kube-state-metrics\"} == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeletDown\"\n \"annotations\": \n \"message\": \"Kubelet has disappeared from Prometheus target discovery.\"\n \ \"expr\": |\n absent(up{job=\"kubelet\"} == 1)\n \"for\": \"15m\"\n + \ \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"NodeExporterDown\"\n + \ \"annotations\": \n \"message\": \"NodeExporter has disappeared from + Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"node-exporter\"} + == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"PrometheusDown\"\n \"annotations\": \n \"message\": \"Prometheus + has disappeared from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"prometheus-k8s\"} + == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"PrometheusOperatorDown\"\n \"annotations\": \n \"message\": + \"PrometheusOperator has disappeared from Prometheus target discovery.\"\n \"expr\": + |\n absent(up{job=\"prometheus-operator\"} == 1)\n \"for\": \"15m\"\n \ \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"kubernetes-apps\"\n \ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n \ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ 
$labels.container @@ -239,28 +268,116 @@ data: 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"warning\"\n \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} + {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} > 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} + {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} > 4\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n \ - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) + \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) + without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) 
without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) + \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) + without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring in less than 7 days.\"\n - \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) + \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \ \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring in less than 1 day.\"\n \"expr\": |\n histogram_quantile(0.01, sum by - (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) - < 86400\n \"labels\": \n \"severity\": \"critical\"" + (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) + < 86400\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"alertmanager.rules\"\n + \ \"rules\": \n - \"alert\": \"AlertmanagerConfigInconsistent\"\n \"annotations\": + \n \"description\": \"The configuration of the instances of the Alertmanager + cluster 
`{{$labels.service}}` are out of sync.\"\n \"summary\": \"Configuration + out of sync\"\n \"expr\": |\n count_values(\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\"}) + BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, + \"service\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") != 1\n \"for\": + \"5m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"AlertmanagerDownOrMissing\"\n + \ \"annotations\": \n \"description\": \"An unexpected number of Alertmanagers + are scraped or Alertmanagers disappeared from discovery.\"\n \"summary\": + \"Alertmanager down or missing\"\n \"expr\": |\n label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, + \"job\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") / ON(job) GROUP_RIGHT() + sum(up{job=\"alertmanager-main\"}) BY (job) != 1\n \"for\": \"5m\"\n \"labels\": + \n \"severity\": \"warning\"\n - \"alert\": \"AlertmanagerFailedReload\"\n + \ \"annotations\": \n \"description\": \"Reloading Alertmanager's configuration + has failed for {{ $labels.namespace }}/{{ $labels.pod}}.\"\n \"summary\": + \"Alertmanager's configuration reload failed\"\n \"expr\": |\n alertmanager_config_last_reload_successful{job=\"alertmanager-main\"} + == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n- + \"name\": \"general.rules\"\n \"rules\": \n - \"alert\": \"TargetDown\"\n \"annotations\": + \n \"description\": \"{{ $value }}% of {{ $labels.job }} targets are down.\"\n + \ \"summary\": \"Targets are down\"\n \"expr\": \"100 * (count(up == 0) + BY (job) / count(up) BY (job)) > 10\"\n \"for\": \"10m\"\n \"labels\": \n + \ \"severity\": \"warning\"\n - \"alert\": \"DeadMansSwitch\"\n \"annotations\": + \n \"description\": \"This is a DeadMansSwitch meant to ensure that the entire + Alerting pipeline is functional.\"\n \"summary\": \"Alerting DeadMansSwitch\"\n + \ \"expr\": 
\"vector(1)\"\n \"labels\": \n \"severity\": \"none\"\n- + \"name\": \"kube-prometheus-node-alerting.rules\"\n \"rules\": \n - \"alert\": + \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": \"device + {{$labels.device}} on node {{$labels.instance}} is running full within the next + 24 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node disk + is running full within 24 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[6h], + 3600 * 24) < 0\n \"for\": \"30m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": + \"device {{$labels.device}} on node {{$labels.instance}} is running full within + the next 2 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node + disk is running full within 2 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[30m], + 3600 * 2) < 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n- + \"name\": \"prometheus.rules\"\n \"rules\": \n - \"alert\": \"PrometheusConfigReloadFailed\"\n + \ \"annotations\": \n \"description\": \"Reloading Prometheus' configuration + has failed for {{$labels.namespace}}/{{$labels.pod}}\"\n \"summary\": \"Reloading + Promehteus' configuration failed\"\n \"expr\": |\n prometheus_config_last_reload_successful{job=\"prometheus-k8s\"} + == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusNotificationQueueRunningFull\"\n \"annotations\": + \n \"description\": \"Prometheus' alert notification queue is running full + for {{$labels.namespace}}/{{ $labels.pod}}\"\n \"summary\": \"Prometheus' + alert notification queue is running full\"\n \"expr\": |\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\"}[5m], + 60 * 30) > prometheus_notifications_queue_capacity{job=\"prometheus-k8s\"}\n \"for\": + \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": 
\"PrometheusErrorSendingAlerts\"\n + \ \"annotations\": \n \"description\": \"Errors while sending alerts from + Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\"\n + \ \"summary\": \"Errors while sending alert from Prometheus\"\n \"expr\": + |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) + / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.01\n + \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": + \"PrometheusErrorSendingAlerts\"\n \"annotations\": \n \"description\": + \"Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} + to Alertmanager {{$labels.Alertmanager}}\"\n \"summary\": \"Errors while + sending alerts from Prometheus\"\n \"expr\": |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) + / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.03\n + \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"PrometheusNotConnectedToAlertmanagers\"\n \"annotations\": \n + \ \"description\": \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is + not connected to any Alertmanagers\"\n \"summary\": \"Prometheus is not connected + to any Alertmanagers\"\n \"expr\": |\n prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\"} + < 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBReloadsFailing\"\n \"annotations\": \n \"description\": + \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures + over the last four hours.\"\n \"summary\": \"Prometheus has issues reloading + data blocks from disk\"\n \"expr\": |\n increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\"}[2h]) + > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBCompactionsFailing\"\n \"annotations\": \n \"description\": + 
\"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction + failures over the last four hours.\"\n \"summary\": \"Prometheus has issues + compacting sample blocks\"\n \"expr\": |\n increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\"}[2h]) + > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBWALCorruptions\"\n \"annotations\": \n \"description\": + \"{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).\"\n + \ \"summary\": \"Prometheus write-ahead log is corrupted\"\n \"expr\": + |\n tsdb_wal_corruptions_total{job=\"prometheus-k8s\"} > 0\n \"for\": + \"4h\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusNotIngestingSamples\"\n + \ \"annotations\": \n \"description\": \"Prometheus {{ $labels.namespace + }}/{{ $labels.pod}} isn't ingesting samples.\"\n \"summary\": \"Prometheus + isn't ingesting samples\"\n \"expr\": |\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\"}[5m]) + <= 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTargetScapesDuplicate\"\n \"annotations\": \n \"description\": + \"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate + timestamps but different values\"\n \"summary\": \"Prometheus has many samples + rejected\"\n \"expr\": |\n increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\"}[5m]) + > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"" kind: ConfigMap metadata: labels: From 0bb4be801cc8d4d793b33e216fc7a3f268b0aa0d Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 29 May 2018 11:29:11 +0200 Subject: [PATCH 272/638] Remove etcd alerts These alerts will be vendored in the future through jsonnet-bundler directly from the etcd repository. 
--- assets/prometheus/rules/etcd3.rules.yaml | 123 ----------------------- 1 file changed, 123 deletions(-) delete mode 100644 assets/prometheus/rules/etcd3.rules.yaml diff --git a/assets/prometheus/rules/etcd3.rules.yaml b/assets/prometheus/rules/etcd3.rules.yaml deleted file mode 100644 index a16bf016..00000000 --- a/assets/prometheus/rules/etcd3.rules.yaml +++ /dev/null @@ -1,123 +0,0 @@ -groups: -- name: ./etcd3.rules - rules: - - alert: InsufficientMembers - expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) - for: 3m - labels: - severity: critical - annotations: - description: If one more etcd member goes down the cluster will be unavailable - summary: etcd cluster insufficient members - - alert: NoLeader - expr: etcd_server_has_leader{job="etcd"} == 0 - for: 1m - labels: - severity: critical - annotations: - description: etcd member {{ $labels.instance }} has no leader - summary: etcd member has no leader - - alert: HighNumberOfLeaderChanges - expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader - changes within the last hour - summary: a high number of leader changes within the etcd cluster are happening - - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, 
grpc_method) > 0.05 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: GRPCRequestsSlow - expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) - > 0.15 - for: 10m - labels: - severity: critical - annotations: - description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method - }} are slow - summary: slow gRPC requests - - alert: HighNumberOfFailedHTTPRequests - expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) - BY (method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}' - summary: a high number of HTTP requests are failing - - alert: HighNumberOfFailedHTTPRequests - expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) - BY (method) > 0.05 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}' - summary: a high number of HTTP requests are failing - - alert: HTTPRequestsSlow - expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning - annotations: - description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method - }} are slow - summary: slow HTTP requests - - alert: EtcdMemberCommunicationSlow - expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning - annotations: - description: etcd 
instance {{ $labels.instance }} member communication with - {{ $labels.To }} is slow - summary: etcd member communication is slow - - alert: HighNumberOfFailedProposals - expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal - failures within the last hour - summary: a high number of proposals within the etcd cluster are failing - - alert: HighFsyncDurations - expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) - > 0.5 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} fync durations are high - summary: high fsync durations - - alert: HighCommitDurations - expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) - > 0.25 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} commit durations are high - summary: high commit durations From 2093b981a002020a5a03e54c3ca4f3a4f4a144c6 Mon Sep 17 00:00:00 2001 From: Joakim Karlsson <5434736+roffe@users.noreply.github.com> Date: Tue, 29 May 2018 11:40:50 +0200 Subject: [PATCH 273/638] Mac compat Make certgen work on OSX --- experimental/custom-metrics-api/gencerts.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/experimental/custom-metrics-api/gencerts.sh b/experimental/custom-metrics-api/gencerts.sh index 312ce74c..1c2a2dc5 100755 --- a/experimental/custom-metrics-api/gencerts.sh +++ b/experimental/custom-metrics-api/gencerts.sh @@ -1,5 +1,14 @@ #!/usr/bin/env bash +# Detect if we are on mac or should use GNU base64 options +case `uname` in + Darwin) + b64_opts='-b=0' + ;; + *) + b64_opts='--width=0' +esac + go get -v -u github.com/cloudflare/cfssl/cmd/... 
export PURPOSE=metrics @@ -16,6 +25,6 @@ kind: Secret metadata: name: cm-adapter-serving-certs data: - serving.crt: $(cat apiserver.pem | base64 --wrap=0) - serving.key: $(cat apiserver-key.pem | base64 --wrap=0) + serving.crt: $(cat apiserver.pem | base64 ${b64_opts}) + serving.key: $(cat apiserver-key.pem | base64 ${b64_opts}) EOF From 62fff622e9900fade8aecbd02bc9c557b736ef85 Mon Sep 17 00:00:00 2001 From: Joakim Karlsson <5434736+roffe@users.noreply.github.com> Date: Tue, 29 May 2018 11:42:20 +0200 Subject: [PATCH 274/638] Update gencerts.sh --- experimental/custom-metrics-api/gencerts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/custom-metrics-api/gencerts.sh b/experimental/custom-metrics-api/gencerts.sh index 1c2a2dc5..7cd8af93 100755 --- a/experimental/custom-metrics-api/gencerts.sh +++ b/experimental/custom-metrics-api/gencerts.sh @@ -6,7 +6,7 @@ case `uname` in b64_opts='-b=0' ;; *) - b64_opts='--width=0' + b64_opts='--wrap=0' esac go get -v -u github.com/cloudflare/cfssl/cmd/... From 7eee585cdf31bf3878ab059f1d457e2f6e89d07e Mon Sep 17 00:00:00 2001 From: Nicholas Whitehead Date: Fri, 25 May 2018 18:21:14 -0400 Subject: [PATCH 275/638] Adds proxy_url to v1.Endpoint. 
Fixes #1385 --- .../servicemonitor-crd.libsonnet | 2 +- ...ervicemonitorCustomResourceDefinition.yaml | 4 +++ manifests/grafana-dashboardDefinitions.yaml | 26 +++++++++---------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet index 1df3123f..94a82be0 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet @@ -1 +1 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"servicemonitors.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"ServiceMonitor","plural":"servicemonitors"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"ServiceMonitor defines monitoring for a set of services.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"ServiceMonitorSpec contains specification parameters for a ServiceMonitor.","properties":{"endpoints":{"description":"A list of endpoints allowed as part of this ServiceMonitor.","items":{"description":"Endpoint defines a scrapeable endpoint serving Prometheus metrics.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerTokenFile":{"description":"File to read bearer token for scraping targets.","type":"string"},"honorLabels":{"description":"HonorLabels chooses the metric's labels on collisions with target labels.","type":"boolean"},"interval":{"description":"Interval at which metrics should be scraped","type":"string"},"metricRelabelings":{"description":"MetricRelabelConfigs to apply to samples before ingestion.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. defailt is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. 
Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"},"params":{"description":"Optional HTTP URL parameters","type":"object"},"path":{"description":"HTTP path to scrape for metrics.","type":"string"},"port":{"description":"Name of the service port this endpoint refers to. Mutually exclusive with targetPort.","type":"string"},"scheme":{"description":"HTTP scheme to use for scraping.","type":"string"},"scrapeTimeout":{"description":"Timeout after which the scrape is ended","type":"string"},"targetPort":{},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}}},"type":"array"},"jobLabel":{"description":"The label to use to retrieve the job name from.","type":"string"},"namespaceSelector":{"description":"A selector for selecting namespaces either selecting all namespaces or a list of namespaces.","properties":{"any":{"description":"Boolean describing whether all namespaces are selected in contrast to a list restricting them.","type":"boolean"},"matchNames":{"description":"List of namespace names.","items":{"type":"string"},"type":"array"}}},"selector":{"description":"A label selector is a label query over a set of resources. 
The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"targetLabels":{"description":"TargetLabels transfers labels on the Kubernetes Service onto the target.","items":{"type":"string"},"type":"array"}},"required":["endpoints","selector"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"servicemonitors.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"ServiceMonitor","plural":"servicemonitors"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"ServiceMonitor defines monitoring for a set of services.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"ServiceMonitorSpec contains specification parameters for a ServiceMonitor.","properties":{"endpoints":{"description":"A list of endpoints allowed as part of this ServiceMonitor.","items":{"description":"Endpoint defines a scrapeable endpoint serving Prometheus metrics.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerTokenFile":{"description":"File to read bearer token for scraping targets.","type":"string"},"honorLabels":{"description":"HonorLabels chooses the metric's labels on collisions with target labels.","type":"boolean"},"interval":{"description":"Interval at which metrics should be scraped","type":"string"},"metricRelabelings":{"description":"MetricRelabelConfigs to apply to samples before ingestion.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. defailt is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. 
Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"},"params":{"description":"Optional HTTP URL parameters","type":"object"},"path":{"description":"HTTP path to scrape for metrics.","type":"string"},"port":{"description":"Name of the service port this endpoint refers to. Mutually exclusive with targetPort.","type":"string"},"proxyUrl":{"description":"ProxyURL eg http://proxyserver:2195 Directs scrapes to proxy through this endpoint.","type":"string"},"scheme":{"description":"HTTP scheme to use for scraping.","type":"string"},"scrapeTimeout":{"description":"Timeout after which the scrape is ended","type":"string"},"targetPort":{},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}}},"type":"array"},"jobLabel":{"description":"The label to use to retrieve the job name from.","type":"string"},"namespaceSelector":{"description":"A selector for selecting namespaces either selecting all namespaces or a list of namespaces.","properties":{"any":{"description":"Boolean describing whether all namespaces are selected in contrast to a list restricting them.","type":"boolean"},"matchNames":{"description":"List of namespace 
names.","items":{"type":"string"},"type":"array"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"targetLabels":{"description":"TargetLabels transfers labels on the Kubernetes Service onto the target.","items":{"type":"string"},"type":"array"}},"required":["endpoints","selector"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index 6573b146..c6bc96a1 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -133,6 +133,10 @@ spec: description: Name of the service port this endpoint refers to. Mutually exclusive with targetPort. type: string + proxyUrl: + description: ProxyURL eg http://proxyserver:2195 Directs scrapes + to proxy through this endpoint. + type: string scheme: description: HTTP scheme to use for scraping. 
type: string diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index af7e2749..f4058562 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -3868,7 +3868,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod_name=\"$pod\"}[1m])) by (container_name)", + "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4097,7 +4097,7 @@ data: ], "targets": [ { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4228,7 +4228,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}) by (container_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4457,7 +4457,7 @@ data: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by 
(container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5003,7 +5003,7 @@ data: "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "percent", "gauge": { "maxValue": 100, @@ -5206,7 +5206,7 @@ data: "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "percent", "gauge": { "maxValue": 100, @@ -6066,7 +6066,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6145,7 +6145,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6224,7 +6224,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6317,7 +6317,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6397,7 +6397,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6477,7 +6477,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6557,7 +6557,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, From 453e170f6a8d965278461c3a4b68b5e9895c9016 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 29 May 2018 16:42:38 +0200 Subject: [PATCH 276/638] prometheus: Enable live migration of rule configmaps to rule file crds With this patch the Prometheus Operator checks if there are any Kubernetes 
ConfigMaps inside the Prometheus namespace which fulfill the `Spec.RuleSelector` requirement. If so, it creates a RuleFile for each key in the `ConfigMap.Spec.Data` map inside the Prometheus namespace. --- 0prometheus-operator-clusterRole.yaml | 71 +++++++++++++++++++ 0prometheus-operator-deployment.yaml | 41 +++++++++++ .../alertmanager-crd.libsonnet | 2 +- .../rulefile-crd.libsonnet | 2 +- ...0alertmanagerCustomResourceDefinition.yaml | 2 +- 5 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 0prometheus-operator-clusterRole.yaml create mode 100644 0prometheus-operator-deployment.yaml diff --git a/0prometheus-operator-clusterRole.yaml b/0prometheus-operator-clusterRole.yaml new file mode 100644 index 00000000..a790429f --- /dev/null +++ b/0prometheus-operator-clusterRole.yaml @@ -0,0 +1,71 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus-operator +rules: +- apiGroups: + - extensions + resources: + - thirdpartyresources + verbs: + - '*' +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - prometheuses + - prometheuses/finalizers + - alertmanagers/finalizers + - servicemonitors + - rulefiles + verbs: + - '*' +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - '*' +- apiGroups: + - "" + resources: + - configmaps + - secrets + verbs: + - '*' +- apiGroups: + - "" + resources: + - pods + verbs: + - list + - delete +- apiGroups: + - "" + resources: + - services + - endpoints + verbs: + - get + - create + - update +- apiGroups: + - "" + resources: + - nodes + verbs: + - list + - watch +- apiGroups: + - "" + resources: + - namespaces + verbs: + - list + - watch diff --git a/0prometheus-operator-deployment.yaml b/0prometheus-operator-deployment.yaml new file mode 100644 index 00000000..b4e0cb4e --- /dev/null +++ b/0prometheus-operator-deployment.yaml @@ -0,0 +1,41 @@
+apiVersion: apps/v1beta2 +kind: Deployment +metadata: + labels: + k8s-app: prometheus-operator + name: prometheus-operator + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + k8s-app: prometheus-operator + template: + metadata: + labels: + k8s-app: prometheus-operator + spec: + containers: + - args: + - --kubelet-service=kube-system/kubelet + - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:96d74644 + - --log-level=all + image: quay.io/coreos/prometheus-operator:96d74644 + name: prometheus-operator + ports: + - containerPort: 8080 + name: http + resources: + limits: + cpu: 200m + memory: 100Mi + requests: + cpu: 100m + memory: 50Mi + nodeSelector: + beta.kubernetes.io/os: linux + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: prometheus-operator diff --git a/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet index 604a81f6..4a422463 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet @@ -1 +1 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"alertmanagers.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Alertmanager","plural":"alertmanagers"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Describes an Alertmanager cluster.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"baseImage":{"description":"Base image that is used to deploy pods, without tag.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to an Alertmanager pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. 
The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or its key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. 
Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. 
For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. 
Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. 
Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"externalUrl":{"description":"The external URL the Alertmanager instances will be available under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Alertmanager server listen on loopback, so that it does not bind against the Pod IP. 
Note this is only for the Alertmanager UI, not the gossip communication.","type":"boolean"},"logLevel":{"description":"Log level for Alertmanager to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"If set to true all actions on the underlying managed objects are not going to be performed, except for delete actions.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"replicas":{"description":"Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the running cluster equal to the expected size.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"routePrefix":{"description":"The route prefix Alertmanager registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods. The Secrets are mounted into /etc/alertmanager/secrets/\u003csecret-name\u003e.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. 
Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Alertmanager Pods.","type":"string"},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. 
Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. 
Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version the cluster should be on.","type":"string"}}},"status":{"description":"Most recent observed status of the Alertmanager cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. 
Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"alertmanagers.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Alertmanager","plural":"alertmanagers"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Alertmanager describes an Alertmanager cluster.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Alertmanager cluster. 
More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. 
If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. 
Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"baseImage":{"description":"Base image that is used to deploy pods, without tag.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to an Alertmanager pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. 
The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or its key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. 
Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. 
For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. 
Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. 
Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"externalUrl":{"description":"The external URL the Alertmanager instances will be available under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Alertmanager server listen on loopback, so that it does not bind against the Pod IP. 
Note this is only for the Alertmanager UI, not the gossip communication.","type":"boolean"},"logLevel":{"description":"Log level for Alertmanager to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"If set to true all actions on the underlying managed objects are not going to be performed, except for delete actions.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"replicas":{"description":"Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the running cluster equal to the expected size.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"routePrefix":{"description":"The route prefix Alertmanager registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods. The Secrets are mounted into /etc/alertmanager/secrets/\u003csecret-name\u003e.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. 
Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. 
Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. 
Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version the cluster should be on.","type":"string"}}},"status":{"description":"Most recent observed status of the Alertmanager cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. 
Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet index cb8d02fc..96ab6a00 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet @@ -1 +1 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"rulefiles.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"RuleFile","plural":"rulefiles"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"RuleFile defines alerting rules for a Prometheus instance","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. 
Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. 
This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"RuleFileSpec contains specification parameters for a Rule.","properties":{"groups":{"description":"Content of Prometheus rule file","items":{"description":"RuleGroup is a list of sequentially evaluated recording and alerting rules.","properties":{"interval":{"format":"int64","type":"integer"},"name":{"type":"string"},"rules":{"items":{"description":"Rule describes an alerting or recording rule.","properties":{"alert":{"type":"string"},"annotations":{"type":"object"},"expr":{"type":"string"},"for":{"format":"int64","type":"integer"},"labels":{"type":"object"},"record":{"type":"string"}},"required":["expr"]},"type":"array"}},"required":["name","rules"]},"type":"array"}}}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"rulefiles.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"RuleFile","plural":"rulefiles"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"RuleFile defines alerting rules for a Prometheus instance","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"RuleFileSpec contains specification parameters for a Rule.","properties":{"groups":{"description":"Content of Prometheus rule file","items":{"description":"RuleGroup is a list of sequentially evaluated recording and alerting rules.","properties":{"interval":{"type":"string"},"name":{"type":"string"},"rules":{"items":{"description":"Rule describes an alerting or recording rule.","properties":{"alert":{"type":"string"},"annotations":{"type":"object"},"expr":{"type":"string"},"for":{"type":"string"},"labels":{"type":"object"},"record":{"type":"string"}},"required":["expr"]},"type":"array"}},"required":["name","rules"]},"type":"array"}}}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 081e1d4d..560eb6ef 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -11,7 +11,7 @@ spec: scope: Namespaced validation: openAPIV3Schema: - description: Describes an Alertmanager cluster. + description: Alertmanager describes an Alertmanager cluster. 
properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation From 6f6d7dcdc941d365aadf852be153eac043d81edd Mon Sep 17 00:00:00 2001 From: slene Date: Thu, 31 May 2018 18:55:13 +0800 Subject: [PATCH 277/638] support custom kube-prometheus configmapReloader & prometheusConfigReloader image --- .../prometheus-operator/prometheus-operator.libsonnet | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index 32864026..2a81e624 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -6,10 +6,14 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { prometheusOperator: 'v0.19.0', + configmapReloader: 'v0.0.1', + prometheusConfigReloader: 'v0.0.4', }, imageRepos+:: { prometheusOperator: 'quay.io/coreos/prometheus-operator', + configmapReloader: 'quay.io/coreos/configmap-reload', + prometheusConfigReloader: 'quay.io/coreos/prometheus-config-reloader', }, }, @@ -119,7 +123,11 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local operatorContainer = container.new('prometheus-operator', $._config.imageRepos.prometheusOperator + ':' + $._config.versions.prometheusOperator) + container.withPorts(containerPort.newNamed('http', targetPort)) + - container.withArgs(['--kubelet-service=kube-system/kubelet', '--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1']) + + container.withArgs([ + '--kubelet-service=kube-system/kubelet', + '--config-reloader-image=' + $._config.imageRepos.configmapReloader + ':' + $._config.versions.configmapReloader, + '--prometheus-config-reloader=' + $._config.imageRepos.prometheusConfigReloader + ':' + $._config.versions.prometheusConfigReloader, + ]) + container.mixin.resources.withRequests({ cpu: 
'100m', memory: '50Mi' }) + container.mixin.resources.withLimits({ cpu: '200m', memory: '100Mi' }); From 49c76ac80fbeb4723db107a05c74db7de37c6659 Mon Sep 17 00:00:00 2001 From: slene Date: Thu, 31 May 2018 18:59:36 +0800 Subject: [PATCH 278/638] update generated kube-prometheus manifests --- manifests/0prometheus-operator-deployment.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index b965ec1c..0105de22 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -19,6 +19,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.0.4 image: quay.io/coreos/prometheus-operator:v0.19.0 name: prometheus-operator ports: From 553d6b0c6365aeb58a839fb3b1023f5dda8a91a7 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Thu, 31 May 2018 12:46:46 +0200 Subject: [PATCH 279/638] rbac: Remove Third Party Resources rules Since PR 460 [1] the Prometheus Operator is using Kubernetes Custom Resource Definitions instead of Kubernetes Third Party Resources. Permissions to handle Third Party Resources in the RBAC rules of the Prometheus Operator is thereby obsolete. 
[1] https://github.com/coreos/prometheus-operator/pull/460 --- .../prometheus-operator/prometheus-operator.libsonnet | 9 +-------- manifests/0prometheus-operator-clusterRole.yaml | 6 ------ 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index 32864026..365553d6 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -33,13 +33,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local clusterRole = k.rbac.v1.clusterRole; local policyRule = clusterRole.rulesType; - local extensionsRule = policyRule.new() + - policyRule.withApiGroups(['extensions']) + - policyRule.withResources([ - 'thirdpartyresources', - ]) + - policyRule.withVerbs(['*']); - local apiExtensionsRule = policyRule.new() + policyRule.withApiGroups(['apiextensions.k8s.io']) + policyRule.withResources([ @@ -102,7 +95,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; ]) + policyRule.withVerbs(['list', 'watch']); - local rules = [extensionsRule, apiExtensionsRule, monitoringRule, appsRule, coreRule, podRule, routingRule, nodeRule, namespaceRule]; + local rules = [apiExtensionsRule, monitoringRule, appsRule, coreRule, podRule, routingRule, nodeRule, namespaceRule]; clusterRole.new() + clusterRole.mixin.metadata.withName('prometheus-operator') + diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index 8c85391f..94f5ce09 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -3,12 +3,6 @@ kind: ClusterRole metadata: name: prometheus-operator rules: -- apiGroups: - - extensions - resources: - - thirdpartyresources - verbs: - - '*' - apiGroups: - apiextensions.k8s.io resources: From 
304db758130ac770b84a7f95f6149e8152c33c38 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Thu, 31 May 2018 16:09:53 +0200 Subject: [PATCH 280/638] kube-prometheus: Remove duplicate unused files afe9711ce2ef5dd0e0b5091f921ae9bc0aab3711 introduced contrib/kube-prometheus/0prometheus-operator-clusterRole.yaml and contrib/kube-prometheus/0prometheus-operator-deployment.yaml by mistake. --- 0prometheus-operator-clusterRole.yaml | 71 --------------------------- 0prometheus-operator-deployment.yaml | 41 ---------------- 2 files changed, 112 deletions(-) delete mode 100644 0prometheus-operator-clusterRole.yaml delete mode 100644 0prometheus-operator-deployment.yaml diff --git a/0prometheus-operator-clusterRole.yaml b/0prometheus-operator-clusterRole.yaml deleted file mode 100644 index a790429f..00000000 --- a/0prometheus-operator-clusterRole.yaml +++ /dev/null @@ -1,71 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus-operator -rules: -- apiGroups: - - extensions - resources: - - thirdpartyresources - verbs: - - '*' -- apiGroups: - - apiextensions.k8s.io - resources: - - customresourcedefinitions - verbs: - - '*' -- apiGroups: - - monitoring.coreos.com - resources: - - alertmanagers - - prometheuses - - prometheuses/finalizers - - alertmanagers/finalizers - - servicemonitors - - rulefiles - verbs: - - '*' -- apiGroups: - - apps - resources: - - statefulsets - verbs: - - '*' -- apiGroups: - - "" - resources: - - configmaps - - secrets - verbs: - - '*' -- apiGroups: - - "" - resources: - - pods - verbs: - - list - - delete -- apiGroups: - - "" - resources: - - services - - endpoints - verbs: - - get - - create - - update -- apiGroups: - - "" - resources: - - nodes - verbs: - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - verbs: - - list - - watch diff --git a/0prometheus-operator-deployment.yaml b/0prometheus-operator-deployment.yaml deleted file mode 100644 index b4e0cb4e..00000000 --- 
a/0prometheus-operator-deployment.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: apps/v1beta2 -kind: Deployment -metadata: - labels: - k8s-app: prometheus-operator - name: prometheus-operator - namespace: monitoring -spec: - replicas: 1 - selector: - matchLabels: - k8s-app: prometheus-operator - template: - metadata: - labels: - k8s-app: prometheus-operator - spec: - containers: - - args: - - --kubelet-service=kube-system/kubelet - - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:96d74644 - - --log-level=all - image: quay.io/coreos/prometheus-operator:96d74644 - name: prometheus-operator - ports: - - containerPort: 8080 - name: http - resources: - limits: - cpu: 200m - memory: 100Mi - requests: - cpu: 100m - memory: 50Mi - nodeSelector: - beta.kubernetes.io/os: linux - securityContext: - runAsNonRoot: true - runAsUser: 65534 - serviceAccountName: prometheus-operator From b7d0da01d5c79040cf4b550e4828992310a1a366 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 31 May 2018 17:28:37 +0200 Subject: [PATCH 281/638] kube-prometheus: Remove unnecessary namespace selectors --- .../alertmanager/alertmanager.libsonnet | 5 --- .../kube-state-metrics.libsonnet | 5 --- .../node-exporter/node-exporter.libsonnet | 5 --- .../prometheus-operator.libsonnet | 24 ++++++++++++++ .../prometheus/prometheus.libsonnet | 31 +------------------ ... 
0prometheus-operator-serviceMonitor.yaml} | 0 manifests/alertmanager-serviceMonitor.yaml | 3 -- .../kube-state-metrics-serviceMonitor.yaml | 3 -- manifests/node-exporter-serviceMonitor.yaml | 3 -- ...us.yaml => prometheus-serviceMonitor.yaml} | 3 -- 10 files changed, 25 insertions(+), 57 deletions(-) rename manifests/{prometheus-serviceMonitorPrometheusOperator.yaml => 0prometheus-operator-serviceMonitor.yaml} (100%) rename manifests/{prometheus-serviceMonitorPrometheus.yaml => prometheus-serviceMonitor.yaml} (82%) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index db370e1d..473f89d3 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -61,11 +61,6 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by alertmanager: $._config.alertmanager.name, }, }, - namespaceSelector: { - matchNames: [ - 'monitoring', - ], - }, endpoints: [ { port: 'web', diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index c73c16ab..5fe1c074 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -254,11 +254,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; 'k8s-app': 'kube-state-metrics', }, }, - namespaceSelector: { - matchNames: [ - 'monitoring', - ], - }, endpoints: [ { port: 'https-main', diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index d232d920..2d012110 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -134,11 +134,6 @@ local k = import 
'ksonnet/ksonnet.beta.3/k.libsonnet'; 'k8s-app': 'node-exporter', }, }, - namespaceSelector: { - matchNames: [ - 'monitoring', - ], - }, endpoints: [ { port: 'https', diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index 0b991a07..11e9c0c0 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -149,5 +149,29 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; service.mixin.metadata.withLabels({ 'k8s-app': 'prometheus-operator' }) + service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.spec.withClusterIp('None'), + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'prometheus-operator', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'prometheus-operator', + }, + }, + spec: { + endpoints: [ + { + port: 'http', + }, + ], + selector: { + matchLabels: { + 'k8s-app': 'prometheus-operator', + }, + }, + }, + }, }, } diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index dbb903a7..09771ebe 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -222,7 +222,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, }, }, - serviceMonitorPrometheus: + serviceMonitor: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -239,11 +239,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheus: $._config.prometheus.name, }, }, - namespaceSelector: { - matchNames: [ - 'monitoring', - ], - }, endpoints: [ { port: 'web', @@ -252,30 +247,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; ], }, }, - serviceMonitorPrometheusOperator: - { - apiVersion: 
'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'prometheus-operator', - namespace: $._config.namespace, - labels: { - 'k8s-app': 'prometheus-operator', - }, - }, - spec: { - endpoints: [ - { - port: 'http', - }, - ], - selector: { - matchLabels: { - 'k8s-app': 'prometheus-operator', - }, - }, - }, - }, serviceMonitorKubeScheduler: { apiVersion: 'monitoring.coreos.com/v1', diff --git a/manifests/prometheus-serviceMonitorPrometheusOperator.yaml b/manifests/0prometheus-operator-serviceMonitor.yaml similarity index 100% rename from manifests/prometheus-serviceMonitorPrometheusOperator.yaml rename to manifests/0prometheus-operator-serviceMonitor.yaml diff --git a/manifests/alertmanager-serviceMonitor.yaml b/manifests/alertmanager-serviceMonitor.yaml index e4e75ccc..548af0d6 100644 --- a/manifests/alertmanager-serviceMonitor.yaml +++ b/manifests/alertmanager-serviceMonitor.yaml @@ -9,9 +9,6 @@ spec: endpoints: - interval: 30s port: web - namespaceSelector: - matchNames: - - monitoring selector: matchLabels: alertmanager: main diff --git a/manifests/kube-state-metrics-serviceMonitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml index cca52f69..3d1073ad 100644 --- a/manifests/kube-state-metrics-serviceMonitor.yaml +++ b/manifests/kube-state-metrics-serviceMonitor.yaml @@ -21,9 +21,6 @@ spec: tlsConfig: insecureSkipVerify: true jobLabel: k8s-app - namespaceSelector: - matchNames: - - monitoring selector: matchLabels: k8s-app: kube-state-metrics diff --git a/manifests/node-exporter-serviceMonitor.yaml b/manifests/node-exporter-serviceMonitor.yaml index 529f2944..273d2748 100644 --- a/manifests/node-exporter-serviceMonitor.yaml +++ b/manifests/node-exporter-serviceMonitor.yaml @@ -14,9 +14,6 @@ spec: tlsConfig: insecureSkipVerify: true jobLabel: k8s-app - namespaceSelector: - matchNames: - - monitoring selector: matchLabels: k8s-app: node-exporter diff --git a/manifests/prometheus-serviceMonitorPrometheus.yaml 
b/manifests/prometheus-serviceMonitor.yaml similarity index 82% rename from manifests/prometheus-serviceMonitorPrometheus.yaml rename to manifests/prometheus-serviceMonitor.yaml index 90b25476..b7605dbe 100644 --- a/manifests/prometheus-serviceMonitorPrometheus.yaml +++ b/manifests/prometheus-serviceMonitor.yaml @@ -9,9 +9,6 @@ spec: endpoints: - interval: 30s port: web - namespaceSelector: - matchNames: - - monitoring selector: matchLabels: prometheus: k8s From 8f3b505049e1716758ea88acf41f25a836765fff Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 4 Jun 2018 17:13:23 +0200 Subject: [PATCH 282/638] kube-prometheus: Adjust to new Prometheus rule file CRD 89fc4e306972604eba2dcb961a6d29cc27a668ad introduced the new Prometheus rule file custom resource definition. This patch adjusts the kube-prometheus project to use the new custom resource definition. --- .../prometheus-operator.libsonnet | 4 +- .../prometheus/prometheus.libsonnet | 20 +- .../0prometheus-operator-clusterRole.yaml | 1 + .../0prometheus-operator-deployment.yaml | 2 +- manifests/prometheus-rules.yaml | 1171 +++++++++++------ 5 files changed, 809 insertions(+), 389 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index 11e9c0c0..7fce1836 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -7,7 +7,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { prometheusOperator: 'v0.19.0', configmapReloader: 'v0.0.1', - prometheusConfigReloader: 'v0.0.4', }, imageRepos+:: { @@ -52,6 +51,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; 'prometheuses/finalizers', 'alertmanagers/finalizers', 'servicemonitors', + 'rulefiles', ]) + policyRule.withVerbs(['*']); @@ -119,7 +119,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; 
container.withArgs([ '--kubelet-service=kube-system/kubelet', '--config-reloader-image=' + $._config.imageRepos.configmapReloader + ':' + $._config.versions.configmapReloader, - '--prometheus-config-reloader=' + $._config.imageRepos.prometheusConfigReloader + ':' + $._config.versions.prometheusConfigReloader, + '--prometheus-config-reloader=' + $._config.imageRepos.prometheusConfigReloader + ':' + $._config.versions.prometheusOperator, ]) + container.mixin.resources.withRequests({ cpu: '100m', memory: '50Mi' }) + container.mixin.resources.withLimits({ cpu: '200m', memory: '100Mi' }); diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 09771ebe..f6503fed 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -40,11 +40,21 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ prometheus: $._config.prometheus.name }), rules: - local configMap = k.core.v1.configMap; - - configMap.new('prometheus-' + $._config.prometheus.name + '-rules', ({ 'all.rules.yaml': std.manifestYamlDoc($._config.prometheus.rules) } + $._config.prometheus.renderedRules)) + - configMap.mixin.metadata.withLabels({ role: 'alert-rules', prometheus: $._config.prometheus.name }) + - configMap.mixin.metadata.withNamespace($._config.namespace), + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'RuleFile', + metadata: { + labels: { + prometheus: $._config.prometheus.name, + role: 'alert-rules', + }, + name: 'prometheus-' + $._config.prometheus.name + '-rules', + namespace: $._config.namespace, + }, + spec: { + groups: $._config.prometheus.rules.groups, + }, + }, roleBindingDefault: local roleBinding = k.rbac.v1.roleBinding; diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index 
94f5ce09..321859ca 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -17,6 +17,7 @@ rules: - prometheuses/finalizers - alertmanagers/finalizers - servicemonitors + - rulefiles verbs: - '*' - apiGroups: diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 0105de22..e85bbe1f 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -19,7 +19,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.0.4 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.19.0 image: quay.io/coreos/prometheus-operator:v0.19.0 name: prometheus-operator ports: diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index d916ff29..8550d801 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1,387 +1,796 @@ -apiVersion: v1 -data: - all.rules.yaml: "\"groups\": \n- \"name\": \"k8s.rules\"\n \"rules\": \n - \"expr\": - |\n sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\"}[5m])) - by (namespace)\n \"record\": \"namespace:container_cpu_usage_seconds_total:sum_rate\"\n - \ - \"expr\": |\n sum(container_memory_usage_bytes{job=\"kubelet\", image!=\"\"}) - by (namespace)\n \"record\": \"namespace:container_memory_usage_bytes:sum\"\n - \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", - image!=\"\"}[5m])) by (namespace, pod_name)\n * on (namespace, pod_name) - group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, - \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:container_cpu_usage_seconds_total:sum_rate\"\n - \ - \"expr\": |\n sum by (namespace, 
label_name) (\n sum(container_memory_usage_bytes{job=\"kubelet\",image!=\"\"}) - by (pod_name, namespace)\n * on (namespace, pod_name) group_left(label_name)\n - \ label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, \"pod_name\", - \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:container_memory_usage_bytes:sum\"\n - \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"}) - by (namespace, pod)\n * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, - \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_memory_bytes:sum\"\n - \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"} - and on(pod) kube_pod_status_scheduled{condition=\"true\"}) by (namespace, pod)\n - \ * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, - \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_cpu_cores:sum\"\n- - \"name\": \"kube-scheduler.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, - sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n - \ \"record\": \"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n - \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.99, 
sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n - \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n - \ \"record\": \"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n - \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n - \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n - \ \"record\": \"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n - \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n - \ \"record\": 
\"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n- - \"name\": \"kube-apiserver.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, - sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, - pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n \"record\": - \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \"expr\": - |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n - \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n - \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n- - \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info) - by (node))\"\n \"record\": \":kube_pod_info_node_count:\"\n - \"expr\": |\n - \ max(label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\", - \"pod\", \"(.*)\")) by (node, namespace, pod)\n \"record\": \"node_namespace_pod:kube_pod_info:\"\n - \ - \"expr\": |\n count by (node) (sum by (node, cpu) (\n node_cpu{job=\"node-exporter\"}\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ ))\n \"record\": \"node:node_num_cpu:sum\"\n - \"expr\": |\n 1 - - avg(rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m]))\n \"record\": - \":node_cpu_utilisation:avg1m\"\n - \"expr\": |\n 1 - avg by (node) (\n - \ rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m])\n * on (namespace, - pod) group_left(node)\n node_namespace_pod:kube_pod_info:)\n \"record\": - \"node:node_cpu_utilisation:avg1m\"\n - \"expr\": |\n sum(node_load1{job=\"node-exporter\"})\n - \ /\n sum(node:node_num_cpu:sum)\n \"record\": 
\":node_cpu_saturation_load1:\"\n - \ - \"expr\": |\n sum by (node) (\n node_load1{job=\"node-exporter\"}\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n /\n node:node_num_cpu:sum\n \"record\": \"node:node_cpu_saturation_load1:\"\n - \ - \"expr\": |\n 1 -\n sum(node_memory_MemFree{job=\"node-exporter\"} - + node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n - \ /\n sum(node_memory_MemTotal{job=\"node-exporter\"})\n \"record\": - \":node_memory_utilisation:\"\n - \"expr\": |\n sum by (node) (\n (node_memory_MemFree{job=\"node-exporter\"} - + node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_memory_bytes_available:sum\"\n - \"expr\": - |\n sum by (node) (\n node_memory_MemTotal{job=\"node-exporter\"}\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_memory_bytes_total:sum\"\n - \"expr\": |\n - \ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)\n - \ /\n scalar(sum(node:node_memory_bytes_total:sum))\n \"record\": - \"node:node_memory_utilisation:ratio\"\n - \"expr\": |\n 1e3 * sum(\n (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n - \ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n )\n \"record\": - \":node_memory_swap_io_bytes:sum_rate\"\n - \"expr\": |\n 1 -\n sum - by (node) (\n (node_memory_MemFree{job=\"node-exporter\"} + node_memory_Cached{job=\"node-exporter\"} - + node_memory_Buffers{job=\"node-exporter\"})\n * on (namespace, pod) group_left(node)\n - \ node_namespace_pod:kube_pod_info:\n )\n /\n sum by (node) - (\n node_memory_MemTotal{job=\"node-exporter\"}\n * on (namespace, - pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n )\n \"record\": - \"node:node_memory_utilisation:\"\n - \"expr\": |\n 1 - 
(node:node_memory_bytes_available:sum - / node:node_memory_bytes_total:sum)\n \"record\": \"node:node_memory_utilisation_2:\"\n - \ - \"expr\": |\n 1e3 * sum by (node) (\n (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n - \ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n * on (namespace, - pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n )\n \"record\": - \"node:node_memory_swap_io_bytes:sum_rate\"\n - \"expr\": |\n avg(irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) - / 1e3)\n \"record\": \":node_disk_utilisation:avg_irate\"\n - \"expr\": |\n - \ avg by (node) (\n irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) - / 1e3\n * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_disk_utilisation:avg_irate\"\n - \"expr\": - |\n avg(irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) - / 1e3)\n \"record\": \":node_disk_saturation:avg_irate\"\n - \"expr\": |\n - \ avg by (node) (\n irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) - / 1e3\n * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_disk_saturation:avg_irate\"\n - \"expr\": - |\n sum(irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m])) - +\n sum(irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n - \ \"record\": \":node_net_utilisation:sum_irate\"\n - \"expr\": |\n sum - by (node) (\n (irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]) - +\n irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_net_utilisation:sum_irate\"\n - \"expr\": - |\n sum(irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m])) - +\n 
sum(irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n - \ \"record\": \":node_net_saturation:sum_irate\"\n - \"expr\": |\n sum - by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]) - +\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kube-prometheus-node-recording.rules\"\n - \ \"rules\": \n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[3m])) - BY (instance)\"\n \"record\": \"instance:node_cpu:rate:sum\"\n - \"expr\": - \"sum((node_filesystem_size{mountpoint=\\\"/\\\"} - node_filesystem_free{mountpoint=\\\"/\\\"})) - BY (instance)\"\n \"record\": \"instance:node_filesystem_usage:sum\"\n - \"expr\": - \"sum(rate(node_network_receive_bytes[3m])) BY (instance)\"\n \"record\": \"instance:node_network_receive_bytes:rate:sum\"\n - \ - \"expr\": \"sum(rate(node_network_transmit_bytes[3m])) BY (instance)\"\n \"record\": - \"instance:node_network_transmit_bytes:rate:sum\"\n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m])) - WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, - cpu)) BY (instance)\"\n \"record\": \"instance:node_cpu:ratio\"\n - \"expr\": - \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m]))\"\n \"record\": - \"cluster:node_cpu:sum_rate5m\"\n - \"expr\": \"cluster:node_cpu:rate5m / count(sum(node_cpu) - BY (instance, cpu))\"\n \"record\": \"cluster:node_cpu:ratio\"\n- \"name\": - \"kubernetes-absent\"\n \"rules\": \n - \"alert\": \"AlertmanagerDown\"\n \"annotations\": - \n \"message\": \"Alertmanager has disappeared from Prometheus target discovery.\"\n - \ \"expr\": |\n absent(up{job=\"alertmanager-main\"} == 1)\n \"for\": - \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": 
\"KubeAPIDown\"\n - \ \"annotations\": \n \"message\": \"KubeAPI has disappeared from Prometheus - target discovery.\"\n \"expr\": |\n absent(up{job=\"apiserver\"} == 1)\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeControllerManagerDown\"\n \"annotations\": \n \"message\": - \"KubeControllerManager has disappeared from Prometheus target discovery.\"\n - \ \"expr\": |\n absent(up{job=\"kube-controller-manager\"} == 1)\n \"for\": - \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeSchedulerDown\"\n - \ \"annotations\": \n \"message\": \"KubeScheduler has disappeared from - Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-scheduler\"} - == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeStateMetricsDown\"\n \"annotations\": \n \"message\": - \"KubeStateMetrics has disappeared from Prometheus target discovery.\"\n \"expr\": - |\n absent(up{job=\"kube-state-metrics\"} == 1)\n \"for\": \"15m\"\n \"labels\": - \n \"severity\": \"critical\"\n - \"alert\": \"KubeletDown\"\n \"annotations\": - \n \"message\": \"Kubelet has disappeared from Prometheus target discovery.\"\n - \ \"expr\": |\n absent(up{job=\"kubelet\"} == 1)\n \"for\": \"15m\"\n - \ \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"NodeExporterDown\"\n - \ \"annotations\": \n \"message\": \"NodeExporter has disappeared from - Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"node-exporter\"} - == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"PrometheusDown\"\n \"annotations\": \n \"message\": \"Prometheus - has disappeared from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"prometheus-k8s\"} - == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"PrometheusOperatorDown\"\n \"annotations\": \n \"message\": - \"PrometheusOperator has disappeared from Prometheus target 
discovery.\"\n \"expr\": - |\n absent(up{job=\"prometheus-operator\"} == 1)\n \"for\": \"15m\"\n - \ \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"kubernetes-apps\"\n - \ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n - \ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container - }}) is restarting {{ printf \\\"%.2f\\\" $value }} / second\"\n \"expr\": |\n - \ rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[15m]) - > 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubePodNotReady\"\n \"annotations\": \n \"message\": - \"{{ $labels.namespace }}/{{ $labels.pod }} is not ready.\"\n \"expr\": |\n - \ sum by (namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", - phase!~\"Running|Succeeded\"}) > 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": - \"critical\"\n - \"alert\": \"KubeDeploymentGenerationMismatch\"\n \"annotations\": - \n \"message\": \"Deployment {{ $labels.namespace }}/{{ $labels.deployment - }} generation mismatch\"\n \"expr\": |\n kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n - \ !=\n kube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeDeploymentReplicasMismatch\"\n \"annotations\": \n \"message\": - \"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch\"\n - \ \"expr\": |\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n - \ !=\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeStatefulSetReplicasMismatch\"\n \"annotations\": \n \"message\": - \"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica mismatch\"\n - \ \"expr\": |\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n - \ !=\n 
kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeStatefulSetGenerationMismatch\"\n \"annotations\": \n \"message\": - \"StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation mismatch\"\n - \ \"expr\": |\n kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n - \ !=\n kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeDaemonSetRolloutStuck\"\n \"annotations\": \n \"message\": - \"Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespace}}/{{$labels.daemonset}}\"\n - \ \"expr\": |\n kube_daemonset_status_number_ready{job=\"kube-state-metrics\"}\n - \ /\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} - * 100 < 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeDaemonSetNotScheduled\"\n \"annotations\": \n \"message\": - \"A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are - not scheduled.\"\n \"expr\": |\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n - \ -\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} - > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeDaemonSetMisScheduled\"\n \"annotations\": \n \"message\": - \"A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are - running where they are not supposed to run.\"\n \"expr\": |\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} - > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n- - \"name\": \"kubernetes-resources\"\n \"rules\": \n - \"alert\": \"KubeCPUOvercommit\"\n - \ \"annotations\": \n \"message\": \"Overcommited CPU resource requests - on Pods, cannot tolerate node failure.\"\n 
\"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)\n - \ /\n sum(node:node_num_cpu:sum)\n >\n (count(node:node_num_cpu:sum)-1) - / count(node:node_num_cpu:sum)\n \"for\": \"5m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeMemOvercommit\"\n \"annotations\": \n \"message\": - \"Overcommited Memory resource requests on Pods, cannot tolerate node failure.\"\n - \ \"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)\n - \ /\n sum(node_memory_MemTotal)\n >\n (count(node:node_num_cpu:sum)-1)\n - \ /\n count(node:node_num_cpu:sum)\n \"for\": \"5m\"\n \"labels\": - \n \"severity\": \"warning\"\n - \"alert\": \"KubeCPUOvercommit\"\n \"annotations\": - \n \"message\": \"Overcommited CPU resource request quota on Namespaces.\"\n - \ \"expr\": |\n sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", - resource=\"requests.cpu\"})\n /\n sum(node:node_num_cpu:sum)\n > - 1.5\n \"for\": \"5m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeMemOvercommit\"\n \"annotations\": \n \"message\": - \"Overcommited Memory resource request quota on Namespaces.\"\n \"expr\": |\n - \ sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"requests.memory\"})\n - \ /\n sum(node_memory_MemTotal{job=\"node-exporter\"})\n > 1.5\n - \ \"for\": \"5m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": - \"KubeQuotaExceeded\"\n \"annotations\": \n \"message\": \"{{ printf \\\"%0.0f\\\" - $value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}.\"\n - \ \"expr\": |\n 100 * kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n - \ / ignoring(instance, job, type)\n kube_resourcequota{job=\"kube-state-metrics\", - type=\"hard\"}\n > 90\n \"for\": \"15m\"\n \"labels\": \n \"severity\": - \"warning\"\n- \"name\": \"kubernetes-storage\"\n \"rules\": \n - \"alert\": - \"KubePersistentVolumeUsageCritical\"\n \"annotations\": \n 
\"message\": - \"The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace - {{ $labels.namespace }} has {{ printf \\\"%0.0f\\\" $value }}% free.\"\n \"expr\": - |\n 100 * kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\n - \ kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n < 3\n \"for\": - \"1m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubePersistentVolumeFullInFourDays\"\n - \ \"annotations\": \n \"message\": \"Based on recent sampling, the persistent - volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace - }} is expected to fill up within four days.\"\n \"expr\": |\n predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[1h], - 4 * 24 * 3600) < 0\n \"for\": \"5m\"\n \"labels\": \n \"severity\": - \"critical\"\n- \"name\": \"kubernetes-system\"\n \"rules\": \n - \"alert\": - \"KubeNodeNotReady\"\n \"annotations\": \n \"message\": \"{{ $labels.node - }} has been unready for more than an hour\"\n \"expr\": |\n kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} - == 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeVersionMismatch\"\n \"annotations\": \n \"message\": - \"There are {{ $value }} different versions of Kubernetes components running.\"\n - \ \"expr\": |\n count(count(kubernetes_build_info{job!=\"kube-dns\"}) by - (gitVersion)) > 1\n \"for\": \"1h\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeClientErrors\"\n \"annotations\": \n \"message\": - \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing - {{ printf \\\"%0.0f\\\" $value }}% errors.'\"\n \"expr\": |\n sum(rate(rest_client_requests_total{code!~\"2..\"}[5m])) - by (instance, job) * 100\n /\n sum(rate(rest_client_requests_total[5m])) - by (instance, job)\n > 1\n \"for\": \"15m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": 
\"KubeClientErrors\"\n \"annotations\": \n \"message\": - \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing - {{ printf \\\"%0.0f\\\" $value }} errors / sec.'\"\n \"expr\": |\n sum(rate(ksm_scrape_error_total{job=\"kube-state-metrics\"}[5m])) - by (instance, job) > 0.1\n \"for\": \"15m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeletTooManyPods\"\n \"annotations\": \n \"message\": - \"Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit - of 110.\"\n \"expr\": |\n kubelet_running_pod_count{job=\"kubelet\"} > - 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": - \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} - > 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": - \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} - > 4\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) - without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": - \"critical\"\n - 
\"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) - without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \"annotations\": - \n \"message\": \"Kubernetes API certificate is expiring in less than 7 days.\"\n - \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) - < 604800\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n - \ \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring - in less than 1 day.\"\n \"expr\": |\n histogram_quantile(0.01, sum by - (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) - < 86400\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"alertmanager.rules\"\n - \ \"rules\": \n - \"alert\": \"AlertmanagerConfigInconsistent\"\n \"annotations\": - \n \"description\": \"The configuration of the instances of the Alertmanager - cluster `{{$labels.service}}` are out of sync.\"\n \"summary\": \"Configuration - out of sync\"\n \"expr\": |\n count_values(\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\"}) - BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, - \"service\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") != 1\n \"for\": - \"5m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"AlertmanagerDownOrMissing\"\n - \ \"annotations\": \n \"description\": \"An unexpected number of Alertmanagers - are scraped or Alertmanagers disappeared from discovery.\"\n \"summary\": - 
\"Alertmanager down or missing\"\n \"expr\": |\n label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, - \"job\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") / ON(job) GROUP_RIGHT() - sum(up{job=\"alertmanager-main\"}) BY (job) != 1\n \"for\": \"5m\"\n \"labels\": - \n \"severity\": \"warning\"\n - \"alert\": \"AlertmanagerFailedReload\"\n - \ \"annotations\": \n \"description\": \"Reloading Alertmanager's configuration - has failed for {{ $labels.namespace }}/{{ $labels.pod}}.\"\n \"summary\": - \"Alertmanager's configuration reload failed\"\n \"expr\": |\n alertmanager_config_last_reload_successful{job=\"alertmanager-main\"} - == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n- - \"name\": \"general.rules\"\n \"rules\": \n - \"alert\": \"TargetDown\"\n \"annotations\": - \n \"description\": \"{{ $value }}% of {{ $labels.job }} targets are down.\"\n - \ \"summary\": \"Targets are down\"\n \"expr\": \"100 * (count(up == 0) - BY (job) / count(up) BY (job)) > 10\"\n \"for\": \"10m\"\n \"labels\": \n - \ \"severity\": \"warning\"\n - \"alert\": \"DeadMansSwitch\"\n \"annotations\": - \n \"description\": \"This is a DeadMansSwitch meant to ensure that the entire - Alerting pipeline is functional.\"\n \"summary\": \"Alerting DeadMansSwitch\"\n - \ \"expr\": \"vector(1)\"\n \"labels\": \n \"severity\": \"none\"\n- - \"name\": \"kube-prometheus-node-alerting.rules\"\n \"rules\": \n - \"alert\": - \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": \"device - {{$labels.device}} on node {{$labels.instance}} is running full within the next - 24 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node disk - is running full within 24 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[6h], - 3600 * 24) < 0\n \"for\": \"30m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": - \"device 
{{$labels.device}} on node {{$labels.instance}} is running full within - the next 2 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node - disk is running full within 2 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[30m], - 3600 * 2) < 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n- - \"name\": \"prometheus.rules\"\n \"rules\": \n - \"alert\": \"PrometheusConfigReloadFailed\"\n - \ \"annotations\": \n \"description\": \"Reloading Prometheus' configuration - has failed for {{$labels.namespace}}/{{$labels.pod}}\"\n \"summary\": \"Reloading - Promehteus' configuration failed\"\n \"expr\": |\n prometheus_config_last_reload_successful{job=\"prometheus-k8s\"} - == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusNotificationQueueRunningFull\"\n \"annotations\": - \n \"description\": \"Prometheus' alert notification queue is running full - for {{$labels.namespace}}/{{ $labels.pod}}\"\n \"summary\": \"Prometheus' - alert notification queue is running full\"\n \"expr\": |\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\"}[5m], - 60 * 30) > prometheus_notifications_queue_capacity{job=\"prometheus-k8s\"}\n \"for\": - \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusErrorSendingAlerts\"\n - \ \"annotations\": \n \"description\": \"Errors while sending alerts from - Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\"\n - \ \"summary\": \"Errors while sending alert from Prometheus\"\n \"expr\": - |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) - / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.01\n - \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": - \"PrometheusErrorSendingAlerts\"\n \"annotations\": \n \"description\": - \"Errors while sending alerts from Prometheus 
{{$labels.namespace}}/{{ $labels.pod}} - to Alertmanager {{$labels.Alertmanager}}\"\n \"summary\": \"Errors while - sending alerts from Prometheus\"\n \"expr\": |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) - / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.03\n - \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"PrometheusNotConnectedToAlertmanagers\"\n \"annotations\": \n - \ \"description\": \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is - not connected to any Alertmanagers\"\n \"summary\": \"Prometheus is not connected - to any Alertmanagers\"\n \"expr\": |\n prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\"} - < 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusTSDBReloadsFailing\"\n \"annotations\": \n \"description\": - \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures - over the last four hours.\"\n \"summary\": \"Prometheus has issues reloading - data blocks from disk\"\n \"expr\": |\n increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\"}[2h]) - > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusTSDBCompactionsFailing\"\n \"annotations\": \n \"description\": - \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction - failures over the last four hours.\"\n \"summary\": \"Prometheus has issues - compacting sample blocks\"\n \"expr\": |\n increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\"}[2h]) - > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusTSDBWALCorruptions\"\n \"annotations\": \n \"description\": - \"{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).\"\n - \ \"summary\": \"Prometheus write-ahead log is corrupted\"\n \"expr\": - |\n 
tsdb_wal_corruptions_total{job=\"prometheus-k8s\"} > 0\n \"for\": - \"4h\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusNotIngestingSamples\"\n - \ \"annotations\": \n \"description\": \"Prometheus {{ $labels.namespace - }}/{{ $labels.pod}} isn't ingesting samples.\"\n \"summary\": \"Prometheus - isn't ingesting samples\"\n \"expr\": |\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\"}[5m]) - <= 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusTargetScapesDuplicate\"\n \"annotations\": \n \"description\": - \"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate - timestamps but different values\"\n \"summary\": \"Prometheus has many samples - rejected\"\n \"expr\": |\n increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\"}[5m]) - > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"" -kind: ConfigMap +apiVersion: monitoring.coreos.com/v1 +kind: RuleFile metadata: labels: prometheus: k8s role: alert-rules name: prometheus-k8s-rules namespace: monitoring +spec: + groups: + - name: k8s.rules + rules: + - expr: | + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace) + record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace) + record: namespace:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace, pod_name) + * on (namespace, pod_name) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, label_name) ( + sum(container_memory_usage_bytes{job="kubelet",image!=""}) by (pod_name, namespace) + * on 
(namespace, pod_name) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + 
labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - name: kube-apiserver.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.5, 
sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - name: node.rules + rules: + - expr: sum(min(kube_pod_info) by (node)) + record: ':kube_pod_info_node_count:' + - expr: | + max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (node) (sum by (node, cpu) ( + node_cpu{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: | + 1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m])) + record: :node_cpu_utilisation:avg1m + - expr: | + 1 - avg by (node) ( + rate(node_cpu{job="node-exporter",mode="idle"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info:) + record: node:node_cpu_utilisation:avg1m + - expr: | + sum(node_load1{job="node-exporter"}) + / + sum(node:node_num_cpu:sum) + record: ':node_cpu_saturation_load1:' + - expr: | + sum by (node) ( + node_load1{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + node:node_num_cpu:sum + record: 'node:node_cpu_saturation_load1:' + - expr: | + 1 - + sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + / + sum(node_memory_MemTotal{job="node-exporter"}) + record: ':node_memory_utilisation:' + - expr: | + sum by (node) ( + (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_available:sum + - expr: | + sum by (node) ( + node_memory_MemTotal{job="node-exporter"} + * on (namespace, pod) group_left(node) + 
node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_total:sum + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:node_memory_utilisation:ratio + - expr: | + 1e3 * sum( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + ) + record: :node_memory_swap_io_bytes:sum_rate + - expr: | + 1 - + sum by (node) ( + (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + sum by (node) ( + node_memory_MemTotal{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: 'node:node_memory_utilisation:' + - expr: | + 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) + record: 'node:node_memory_utilisation_2:' + - expr: | + 1e3 * sum by (node) ( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_swap_io_bytes:sum_rate + - expr: | + avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + record: :node_disk_utilisation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_utilisation:avg_irate + - expr: | + avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + record: :node_disk_saturation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + * on (namespace, pod) group_left(node) + 
node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_saturation:avg_irate + - expr: | + sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) + record: :node_net_utilisation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_utilisation:sum_irate + - expr: | + sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) + record: :node_net_saturation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_saturation:sum_irate + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + BY (instance) + record: instance:node_filesystem_usage:sum + - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) + / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) + record: 
cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kubernetes-absent + rules: + - alert: AlertmanagerDown + annotations: + message: Alertmanager has disappeared from Prometheus target discovery. + expr: | + absent(up{job="alertmanager-main"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPIDown + annotations: + message: KubeAPI has disappeared from Prometheus target discovery. + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeControllerManagerDown + annotations: + message: KubeControllerManager has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeSchedulerDown + annotations: + message: KubeScheduler has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsDown + annotations: + message: KubeStateMetrics has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-state-metrics"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeletDown + annotations: + message: Kubelet has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubelet"} == 1) + for: 15m + labels: + severity: critical + - alert: NodeExporterDown + annotations: + message: NodeExporter has disappeared from Prometheus target discovery. + expr: | + absent(up{job="node-exporter"} == 1) + for: 15m + labels: + severity: critical + - alert: PrometheusDown + annotations: + message: Prometheus has disappeared from Prometheus target discovery. + expr: | + absent(up{job="prometheus-k8s"} == 1) + for: 15m + labels: + severity: critical + - alert: PrometheusOperatorDown + annotations: + message: PrometheusOperator has disappeared from Prometheus target discovery. 
+ expr: | + absent(up{job="prometheus-operator"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is restarting {{ printf "%.2f" $value }} / second' + expr: | + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0 + for: 1h + labels: + severity: critical + - alert: KubePodNotReady + annotations: + message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.' + expr: | + sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0 + for: 1h + labels: + severity: critical + - alert: KubeDeploymentGenerationMismatch + annotations: + message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation + mismatch + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeDeploymentReplicasMismatch + annotations: + message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica + mismatch + expr: | + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetReplicasMismatch + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica + mismatch + expr: | + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetGenerationMismatch + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation + mismatch + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + 
kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetRolloutStuck + annotations: + message: Only {{$value}}% of desired pods scheduled and ready for daemon set + {{$labels.namespace}}/{{$labels.daemonset}} + expr: | + kube_daemonset_status_number_ready{job="kube-state-metrics"} + / + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetNotScheduled + annotations: + message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} + are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} + are running where they are not supposed to run. + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + message: Overcommited CPU resource requests on Pods, cannot tolerate node + failure. + expr: | + sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) + / + sum(node:node_num_cpu:sum) + > + (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Overcommited Memory resource requests on Pods, cannot tolerate node + failure. 
+ expr: | + sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) + / + sum(node_memory_MemTotal) + > + (count(node:node_num_cpu:sum)-1) + / + count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeCPUOvercommit + annotations: + message: Overcommited CPU resource request quota on Namespaces. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) + / + sum(node:node_num_cpu:sum) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Overcommited Memory resource request quota on Namespaces. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) + / + sum(node_memory_MemTotal{job="node-exporter"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaExceeded + annotations: + message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in + namespace {{ $labels.namespace }}.' + expr: | + 100 * kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + kube_resourcequota{job="kube-state-metrics", type="hard"} + > 90 + for: 15m + labels: + severity: warning + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeUsageCritical + annotations: + message: The persistent volume claimed by {{ $labels.persistentvolumeclaim + }} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}% + free. + expr: | + 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + < 3 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFullInFourDays + annotations: + message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim + }} in namespace {{ $labels.namespace }} is expected to fill up within four + days. 
+ expr: | + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeNodeNotReady + annotations: + message: '{{ $labels.node }} has been unready for more than an hour' + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + - alert: KubeVersionMismatch + annotations: + message: There are {{ $value }} different versions of Kubernetes components + running. + expr: | + count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 + for: 1h + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }}% errors.' + expr: | + sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100 + / + sum(rate(rest_client_requests_total[5m])) by (instance, job) + > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }} errors / sec.' + expr: | + sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to + the limit of 110. + expr: | + kubelet_running_pod_count{job="kubelet"} > 100 + for: 15m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}}. 
+ expr: | + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 + for: 10m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}}. + expr: | + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is erroring for {{ $value }}% of requests. + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is erroring for {{ $value }}% of requests. + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 + for: 10m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: Kubernetes API certificate is expiring in less than 7 days. + expr: | + histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: Kubernetes API certificate is expiring in less than 1 day. 
+ expr: | + histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + summary: Configuration out of sync + expr: | + count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + - alert: AlertmanagerDownOrMissing + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + summary: Alertmanager down or missing + expr: | + label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1 + for: 5m + labels: + severity: warning + - alert: AlertmanagerFailedReload + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + summary: Alertmanager's configuration reload failed + expr: | + alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0 + for: 10m + labels: + severity: warning + - name: general.rules + rules: + - alert: TargetDown + annotations: + description: '{{ $value }}% of {{ $labels.job }} targets are down.' + summary: Targets are down + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + - alert: DeadMansSwitch + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire Alerting + pipeline is functional. 
+ summary: Alerting DeadMansSwitch + expr: vector(1) + labels: + severity: none + - name: kube-prometheus-node-alerting.rules + rules: + - alert: NodeDiskRunningFull + annotations: + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 24 hours (mounted at {{$labels.mountpoint}}) + summary: Node disk is running full within 24 hours + expr: | + predict_linear(node_filesystem_free{job="node-exporter"}[6h], 3600 * 24) < 0 + for: 30m + labels: + severity: warning + - alert: NodeDiskRunningFull + annotations: + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 2 hours (mounted at {{$labels.mountpoint}}) + summary: Node disk is running full within 2 hours + expr: | + predict_linear(node_filesystem_free{job="node-exporter"}[30m], 3600 * 2) < 0 + for: 10m + labels: + severity: critical + - name: prometheus.rules + rules: + - alert: PrometheusConfigReloadFailed + annotations: + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + summary: Reloading Promehteus' configuration failed + expr: | + prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0 + for: 10m + labels: + severity: warning + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + summary: Prometheus' alert notification queue is running full + expr: | + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"} + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alert from Prometheus + expr: | + 
rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alerts from Prometheus + expr: | + rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03 + for: 10m + labels: + severity: critical + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers + summary: Prometheus is not connected to any Alertmanagers + expr: | + prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + reload failures over the last four hours.' + summary: Prometheus has issues reloading data blocks from disk + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + compaction failures over the last four hours.' + summary: Prometheus has issues compacting sample blocks + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBWALCorruptions + annotations: + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead + log (WAL).' 
+ summary: Prometheus write-ahead log is corrupted + expr: | + tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting + samples. + summary: Prometheus isn't ingesting samples + expr: | + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0 + for: 10m + labels: + severity: warning + - alert: PrometheusTargetScapesDuplicate + annotations: + description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected + due to duplicate timestamps but different values' + summary: Prometheus has many samples rejected + expr: | + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0 + for: 10m + labels: + severity: warning From e84b101fe769700a84f7835a76c797d1613e5483 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 5 Jun 2018 13:09:13 +0200 Subject: [PATCH 283/638] *: Rename RuleFile to PrometheusRule --- Makefile | 2 +- .../prometheus-crd.libsonnet | 2 +- .../prometheus-operator.libsonnet | 3 +- .../prometheusrule-crd.libsonnet | 1 + .../rulefile-crd.libsonnet | 1 - .../prometheus/prometheus.libsonnet | 2 +- ...r-0prometheusCustomResourceDefinition.yaml | 86 ++--- ...rometheusruleCustomResourceDefinition.yaml | 341 ++++++++++++++++++ .../0prometheus-operator-clusterRole.yaml | 2 +- manifests/prometheus-rules.yaml | 2 +- 10 files changed, 392 insertions(+), 50 deletions(-) create mode 100644 jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet delete mode 100644 jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet create mode 100644 manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml diff --git a/Makefile b/Makefile index 60c6d5f3..5b76ffc9 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ crdtojsonnet: cat 
../../example/prometheus-operator-crd/alertmanager.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet cat ../../example/prometheus-operator-crd/prometheus.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet cat ../../example/prometheus-operator-crd/servicemonitor.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet - cat ../../example/prometheus-operator-crd/rulefile.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet + cat ../../example/prometheus-operator-crd/prometheusrule.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet generate-raw: crdtojsonnet fmt jb install diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet index 10e32b6d..292fc079 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet @@ -1 +1 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheuses.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Prometheus","plural":"prometheuses"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Prometheus defines a Prometheus deployment.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"additionalAlertManagerConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"additionalScrapeConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. 
Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"alerting":{"description":"AlertingSpec defines parameters for alerting configuration of Prometheus servers.","properties":{"alertmanagers":{"description":"AlertmanagerEndpoints Prometheus should fire alerts against.","items":{"description":"AlertmanagerEndpoints defines a selection of a single Endpoints object containing alertmanager IPs to fire alerts against.","properties":{"bearerTokenFile":{"description":"BearerTokenFile to read from filesystem to use when authenticating to Alertmanager.","type":"string"},"name":{"description":"Name of Endpoints object in Namespace.","type":"string"},"namespace":{"description":"Namespace of Endpoints object.","type":"string"},"pathPrefix":{"description":"Prefix for the HTTP path alerts are pushed to.","type":"string"},"port":{},"scheme":{"description":"Scheme to use when firing alerts.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the 
targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}},"required":["namespace","name","port"]},"type":"array"}},"required":["alertmanagers"]},"baseImage":{"description":"Base image to use for a Prometheus deployment.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. 
Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a 
key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). 
Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"evaluationInterval":{"description":"Interval between consecutive evaluations.","type":"string"},"externalLabels":{"description":"The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager).","type":"object"},"externalUrl":{"description":"The external URL the Prometheus instances will be available under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.","type":"boolean"},"logLevel":{"description":"Log level for Prometheus to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"When a Prometheus deployment is paused, no actions except for deletion will be performed on the underlying objects.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within which each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects that this object depends on. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"remoteRead":{"description":"If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteReadSpec defines the remote_read configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allows an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote read.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote read.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"readRecent":{"description":"Whether reads should be made for queries for time ranges that the local storage should have complete data for.","type":"boolean"},"remoteTimeout":{"description":"Timeout for requests to the remote read endpoint.","type":"string"},"requiredMatchers":{"description":"An optional list of equality matchers which have to be present in a selector to query the remote read endpoint.","type":"object"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to query from.","type":"string"}},"required":["url"]},"type":"array"},"remoteWrite":{"description":"If specified, the remote_write spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteWriteSpec defines the remote_write configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allows an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"Bearer token for remote write.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote write.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"remoteTimeout":{"description":"Timeout for requests to the remote write endpoint.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"},"writeRelabelConfigs":{"description":"The list of remote write relabel configurations.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. 
Default is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. Default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"}},"required":["url"]},"type":"array"},"replicas":{"description":"Number of instances to deploy for a Prometheus deployment.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"retention":{"description":"Time duration Prometheus shall retain data for.","type":"string"},"routePrefix":{"description":"The route prefix Prometheus registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"ruleFileNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"ruleFileSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"ruleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"scrapeInterval":{"description":"Interval between consecutive scrapes.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. The Secrets are mounted into /etc/prometheus/secrets/\u003csecret-name\u003e. Secrets changes after initial creation of a Prometheus object are not reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated with the new list of secrets.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. 
The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. 
If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"serviceMonitorNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"serviceMonitorSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. 
Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. 
Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version of Prometheus to be deployed.","type":"string"}}},"status":{"description":"Most recent observed status of the Prometheus cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Prometheus deployment.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. 
Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Prometheus deployment.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheuses.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Prometheus","plural":"prometheuses"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Prometheus defines a Prometheus deployment.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Prometheus cluster. 
More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"additionalAlertManagerConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"additionalScrapeConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. 
Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"alerting":{"description":"AlertingSpec defines parameters for alerting configuration of Prometheus servers.","properties":{"alertmanagers":{"description":"AlertmanagerEndpoints Prometheus should fire alerts against.","items":{"description":"AlertmanagerEndpoints defines a selection of a single Endpoints object containing alertmanager IPs to fire alerts against.","properties":{"bearerTokenFile":{"description":"BearerTokenFile to read from filesystem to use when authenticating to Alertmanager.","type":"string"},"name":{"description":"Name of Endpoints object in Namespace.","type":"string"},"namespace":{"description":"Namespace of Endpoints object.","type":"string"},"pathPrefix":{"description":"Prefix for the HTTP path alerts are pushed to.","type":"string"},"port":{},"scheme":{"description":"Scheme to use when firing alerts.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the 
targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}},"required":["namespace","name","port"]},"type":"array"}},"required":["alertmanagers"]},"baseImage":{"description":"Base image to use for a Prometheus deployment.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. 
Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a 
key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). 
Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"evaluationInterval":{"description":"Interval between consecutive evaluations.","type":"string"},"externalLabels":{"description":"The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager).","type":"object"},"externalUrl":{"description":"The external URL the Prometheus instances will be available under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.","type":"boolean"},"logLevel":{"description":"Log level for Prometheus to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"When a Prometheus deployment is paused, no actions except for deletion will be performed on the underlying objects.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within which each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended on by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"prometheusRuleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"remoteRead":{"description":"If specified, the remote_read spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteReadSpec defines the remote_read configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote read.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote read.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"readRecent":{"description":"Whether reads should be made for queries for time ranges that the local storage should have complete data for.","type":"boolean"},"remoteTimeout":{"description":"Timeout for requests to the remote read endpoint.","type":"string"},"requiredMatchers":{"description":"An optional list of equality matchers which have to be present in a selector to query the remote read endpoint.","type":"object"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to query from.","type":"string"}},"required":["url"]},"type":"array"},"remoteWrite":{"description":"If specified, the remote_write spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteWriteSpec defines the remote_write configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote write.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote write.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"remoteTimeout":{"description":"Timeout for requests to the remote write endpoint.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"},"writeRelabelConfigs":{"description":"The list of remote write relabel configurations.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. 
Default is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. Default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"}},"required":["url"]},"type":"array"},"replicas":{"description":"Number of instances to deploy for a Prometheus deployment.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"retention":{"description":"Time duration Prometheus shall retain data for.","type":"string"},"routePrefix":{"description":"The route prefix Prometheus registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"ruleNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"ruleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"scrapeInterval":{"description":"Interval between consecutive scrapes.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. The Secrets are mounted into /etc/prometheus/secrets/\u003csecret-name\u003e. Secrets changes after initial creation of a Prometheus object are not reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated with the new list of secrets.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. 
Some fields are also present in container.securityContext. Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"serviceMonitorNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"serviceMonitorSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storage":{"description":"StorageSpec defines the configured storage for a group of Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. 
If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. 
If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version of Prometheus to be deployed.","type":"string"}}},"status":{"description":"Most recent observed status of the Prometheus cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Prometheus deployment.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Prometheus deployment.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index 7fce1836..6e8d4b5d 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -21,6 +21,7 @@ local k = import 
'ksonnet/ksonnet.beta.3/k.libsonnet'; '0alertmanagerCustomResourceDefinition': import 'alertmanager-crd.libsonnet', '0prometheusCustomResourceDefinition': import 'prometheus-crd.libsonnet', '0servicemonitorCustomResourceDefinition': import 'servicemonitor-crd.libsonnet', + '0prometheusruleCustomResourceDefinition': import 'prometheusrule-crd.libsonnet', clusterRoleBinding: local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; @@ -51,7 +52,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; 'prometheuses/finalizers', 'alertmanagers/finalizers', 'servicemonitors', - 'rulefiles', + 'prometheusrules', ]) + policyRule.withVerbs(['*']); diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet new file mode 100644 index 00000000..969e8109 --- /dev/null +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet @@ -0,0 +1 @@ +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheusrules.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"PrometheusRule","plural":"prometheusrules"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"PrometheusRule defines alerting rules for a Prometheus instance","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PrometheusRuleSpec contains specification parameters for a Rule.","properties":{"groups":{"description":"Content of Prometheus rule file","items":{"description":"RuleGroup is a list of sequentially evaluated recording and alerting rules.","properties":{"interval":{"type":"string"},"name":{"type":"string"},"rules":{"items":{"description":"Rule describes an alerting or recording rule.","properties":{"alert":{"type":"string"},"annotations":{"type":"object"},"expr":{"type":"string"},"for":{"type":"string"},"labels":{"type":"object"},"record":{"type":"string"}},"required":["expr"]},"type":"array"}},"required":["name","rules"]},"type":"array"}}}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet deleted file mode 100644 index 96ab6a00..00000000 --- a/jsonnet/kube-prometheus/prometheus-operator/rulefile-crd.libsonnet +++ /dev/null @@ -1 +0,0 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"rulefiles.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"RuleFile","plural":"rulefiles"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"RuleFile defines alerting rules for a Prometheus instance","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"RuleFileSpec contains specification parameters for a Rule.","properties":{"groups":{"description":"Content of Prometheus rule file","items":{"description":"RuleGroup is a list of sequentially evaluated recording and alerting rules.","properties":{"interval":{"type":"string"},"name":{"type":"string"},"rules":{"items":{"description":"Rule describes an alerting or recording rule.","properties":{"alert":{"type":"string"},"annotations":{"type":"object"},"expr":{"type":"string"},"for":{"type":"string"},"labels":{"type":"object"},"record":{"type":"string"}},"required":["expr"]},"type":"array"}},"required":["name","rules"]},"type":"array"}}}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index f6503fed..c99f2a89 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -42,7 +42,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; rules: { apiVersion: 'monitoring.coreos.com/v1', - kind: 'RuleFile', + kind: 'PrometheusRule', metadata: { labels: { prometheus: $._config.prometheus.name, diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 8cc1a59c..25eaa869 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1662,6 +1662,48 @@ spec: Populated by the system. Read-only. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids type: string + prometheusRuleSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object remoteRead: description: If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way. @@ -1899,49 +1941,7 @@ spec: the server serves requests under a different route prefix. For example for use with `kubectl proxy`. type: string - ruleFileNamespaceSelector: - description: A label selector is a label query over a set of resources. - The result of matchLabels and matchExpressions are ANDed. 
An empty - label selector matches all objects. A null label selector matches - no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. - The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains - values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies - to. - type: string - operator: - description: operator represents a key's relationship to a - set of values. Valid operators are In, NotIn, Exists and - DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator - is In or NotIn, the values array must be non-empty. If the - operator is Exists or DoesNotExist, the values array must - be empty. This array is replaced during a strategic merge - patch. - items: - type: string - type: array - required: - - key - - operator - type: array - matchLabels: - description: matchLabels is a map of {key,value} pairs. A single - {key,value} in the matchLabels map is equivalent to an element - of matchExpressions, whose key field is "key", the operator is - "In", and the values array contains only "value". The requirements - are ANDed. - type: object - ruleFileSelector: + ruleNamespaceSelector: description: A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches diff --git a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml new file mode 100644 index 00000000..43f98251 --- /dev/null +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -0,0 +1,341 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: prometheusrules.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: PrometheusRule + plural: prometheusrules + scope: Namespaced + validation: + openAPIV3Schema: + description: PrometheusRule defines alerting rules for a Prometheus instance + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + metadata: + description: ObjectMeta is metadata that all persisted resources must have, + which includes all objects users must create. + properties: + annotations: + description: 'Annotations is an unstructured key value map stored with + a resource that may be set by external tools to store and retrieve + arbitrary metadata. They are not queryable and should be preserved + when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs to. 
This + is used to distinguish resources with same name and namespace in different + clusters. This field is not set anywhere right now and apiserver is + going to ignore it if set in create or update request. + type: string + creationTimestamp: + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to gracefully + terminate before it will be removed from the system. Only set when + deletionTimestamp is also set. May only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted from the registry. + Each entry is an identifier for the responsible component that will + remove the entry from the list. If the deletionTimestamp of the object + is non-nil, entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. + + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation of + the desired state. 
Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that must execute + in order before this object is visible. When the last pending + initializer is removed, and no failing result is set, the initializers + struct will be set to nil and the object is considered as initialized + and visible to all clients. + items: + description: Initializer is information about an initializer that + has not yet completed. + properties: + name: + description: name of the process that is responsible for initializing + this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that don't return + other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this + representation of an object. Servers should convert recognized + schemas to the latest internal value, and may reject unrecognized + values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this status, 0 if + not set. + format: int32 + type: integer + details: + description: StatusDetails is a set of additional properties + that MAY be set by the server to provide additional information + about a response. The Reason field of a Status object defines + what attributes will be set. Clients must ignore fields that + do not match the defined type of each attribute, and should + assume that any attribute may be empty, invalid, or under + defined. + properties: + causes: + description: The Causes array includes more details associated + with the StatusReason failure. Not all StatusReasons may + provide detailed causes. 
+ items: + description: StatusCause provides more information about + an api.Status failure, including cases when multiple + errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description of the cause + of the error. This field may be presented as-is + to a reader. + type: string + reason: + description: A machine-readable description of the + cause of the error. If this value is empty there + is no information available. + type: string + type: array + group: + description: The group attribute of the resource associated + with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource associated + with the status StatusReason. On some operations may differ + from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource associated + with the status StatusReason (when there is a single name + which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds before the + operation should be retried. Some errors may indicate + the client must take an alternate action - for those errors + this field may indicate how long to wait before taking + the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there is a single + resource which can be described). 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing the REST resource + this object represents. Servers may infer this from the endpoint + the client submits requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the status of this + operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic resources + must have, including lists and various status objects. A resource + may have only one of {ObjectMeta, ListMeta}. + properties: + continue: + description: continue may be set if the user set a limit + on the number of items returned, and indicates that the + server has more data available. The value is opaque and + may be used to issue another request to the endpoint that + served this list to retrieve the next set of available + objects. Continuing a list may not be possible if the + server configuration has changed or more than a few minutes + have passed. The resourceVersion field returned when using + this continue value will be identical to the value in + the first response. + type: string + resourceVersion: + description: 'String that identifies the server''s internal + version of this object that can be used by clients to + determine when objects have changed. Value must be treated + as opaque by clients and passed unmodified back to the + server. Populated by the system. Read-only. More info: + https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why this operation + is in the "Failure" status. 
If this value is empty there is + no information available. A Reason clarifies an HTTP status + code but does not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" or + "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be used to organize + and categorize (scope and select) objects. May match selectors of + replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is required when + creating resources, although some resources may allow a client to + request the generation of an appropriate name automatically. Name + is primarily intended for creation idempotence and configuration definition. + Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If ALL objects + in the list have been deleted, this object will be garbage collected. + If this object is managed by a controller, then an entry in this list + will point to this controller, with the controller field set to true. + There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information to let you + identify an owning object. 
Currently, an owning object must be in + the same namespace, so there is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from the key-value + store until this reference is removed. Defaults to false. To + set this field, a user needs "delete" permission of the owner, + otherwise 422 (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the managing controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. Populated by + the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + spec: + description: PrometheusRuleSpec contains specification parameters for a + Rule. + properties: + groups: + description: Content of Prometheus rule file + items: + description: RuleGroup is a list of sequentially evaluated recording + and alerting rules. + properties: + interval: + type: string + name: + type: string + rules: + items: + description: Rule describes an alerting or recording rule. + properties: + alert: + type: string + annotations: + type: object + expr: + type: string + for: + type: string + labels: + type: object + record: + type: string + required: + - expr + type: array + required: + - name + - rules + type: array + required: + - spec + version: v1 +status: + acceptedNames: + kind: "" + plural: "" + conditions: null diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index 321859ca..bad68f27 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -17,7 +17,7 @@ rules: - prometheuses/finalizers - alertmanagers/finalizers - servicemonitors - - rulefiles + - prometheusrules verbs: - '*' - apiGroups: diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 8550d801..b885eb55 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1,5 +1,5 @@ apiVersion: monitoring.coreos.com/v1 -kind: RuleFile +kind: PrometheusRule metadata: labels: prometheus: k8s From 9b4f6ce56b7374af516ddd31c94d9f82038521ee Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 5 Jun 2018 13:56:45 +0200 Subject: [PATCH 284/638] *: Remove PrometheusRuleSelector --- .../prometheus-crd.libsonnet | 2 +- ...r-0prometheusCustomResourceDefinition.yaml | 42 
------------------- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet index 292fc079..8b16da2b 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet @@ -1 +1 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheuses.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Prometheus","plural":"prometheuses"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Prometheus defines a Prometheus deployment.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"additionalAlertManagerConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"additionalScrapeConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"alerting":{"description":"AlertingSpec defines parameters for alerting configuration of Prometheus servers.","properties":{"alertmanagers":{"description":"AlertmanagerEndpoints Prometheus should fire alerts against.","items":{"description":"AlertmanagerEndpoints defines a selection of a single Endpoints object containing alertmanager IPs to fire alerts against.","properties":{"bearerTokenFile":{"description":"BearerTokenFile to read from filesystem to use when authenticating to Alertmanager.","type":"string"},"name":{"description":"Name of Endpoints object in Namespace.","type":"string"},"namespace":{"description":"Namespace of Endpoints object.","type":"string"},"pathPrefix":{"description":"Prefix for the HTTP path alerts are pushed to.","type":"string"},"port":{},"scheme":{"description":"Scheme to use when firing alerts.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the 
targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}},"required":["namespace","name","port"]},"type":"array"}},"required":["alertmanagers"]},"baseImage":{"description":"Base image to use for a Prometheus deployment.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. 
Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a 
key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). 
Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. Default is false.","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"evaluationInterval":{"description":"Interval between consecutive evaluations.","type":"string"},"externalLabels":{"description":"The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager).","type":"object"},"externalUrl":{"description":"The external URL the Prometheus instances will be available under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.","type":"boolean"},"logLevel":{"description":"Log level for Prometheus to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"When a Prometheus deployment is paused, no actions except for deletion will be performed on the underlying objects.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within which each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended on by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"prometheusRuleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"remoteRead":{"description":"If specified, the remote_read spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteReadSpec defines the remote_read configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allows an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote read.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote read.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"readRecent":{"description":"Whether reads should be made for queries for time ranges that the local storage should have complete data for.","type":"boolean"},"remoteTimeout":{"description":"Timeout for requests to the remote read endpoint.","type":"string"},"requiredMatchers":{"description":"An optional list of equality matchers which have to be present in a selector to query the remote read endpoint.","type":"object"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to read samples from.","type":"string"}},"required":["url"]},"type":"array"},"remoteWrite":{"description":"If specified, the remote_write spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteWriteSpec defines the remote_write configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allows an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote write.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote write.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"remoteTimeout":{"description":"Timeout for requests to the remote write endpoint.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"},"writeRelabelConfigs":{"description":"The list of remote write relabel configurations.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. 
Default is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"}},"required":["url"]},"type":"array"},"replicas":{"description":"Number of instances to deploy for a Prometheus deployment.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"retention":{"description":"Time duration Prometheus shall retain data for.","type":"string"},"routePrefix":{"description":"The route prefix Prometheus registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"ruleNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"ruleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"scrapeInterval":{"description":"Interval between consecutive scrapes.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. The Secrets are mounted into /etc/prometheus/secrets/\u003csecret-name\u003e. Secrets changes after initial creation of a Prometheus object are not reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated with the new list of secrets.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. 
Some fields are also present in container.securityContext. Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"serviceMonitorNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"serviceMonitorSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storage":{"description":"StorageSpec defines the configured storage for a group of Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. 
If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. 
If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version of Prometheus to be deployed.","type":"string"}}},"status":{"description":"Most recent observed status of the Prometheus cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Prometheus deployment.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Prometheus deployment.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file +{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheuses.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Prometheus","plural":"prometheuses"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Prometheus defines a Prometheus 
deployment.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"additionalAlertManagerConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"additionalScrapeConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. 
Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"alerting":{"description":"AlertingSpec defines parameters for alerting configuration of Prometheus servers.","properties":{"alertmanagers":{"description":"AlertmanagerEndpoints Prometheus should fire alerts against.","items":{"description":"AlertmanagerEndpoints defines a selection of a single Endpoints object containing alertmanager IPs to fire alerts against.","properties":{"bearerTokenFile":{"description":"BearerTokenFile to read from filesystem to use when authenticating to Alertmanager.","type":"string"},"name":{"description":"Name of Endpoints object in Namespace.","type":"string"},"namespace":{"description":"Namespace of Endpoints object.","type":"string"},"pathPrefix":{"description":"Prefix for the HTTP path alerts are pushed to.","type":"string"},"port":{},"scheme":{"description":"Scheme to use when firing alerts.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the 
targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}},"required":["namespace","name","port"]},"type":"array"}},"required":["alertmanagers"]},"baseImage":{"description":"Base image to use for a Prometheus deployment.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. 
Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a 
key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). 
Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"evaluationInterval":{"description":"Interval between consecutive evaluations.","type":"string"},"externalLabels":{"description":"The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager).","type":"object"},"externalUrl":{"description":"The external URL the Prometheus instances will be available under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.","type":"boolean"},"logLevel":{"description":"Log level for Prometheus to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"When a Prometheus deployment is paused, no actions except for deletion will be performed on the underlying objects.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"remoteRead":{"description":"If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteReadSpec defines the remote_read configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote read.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote read.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"readRecent":{"description":"Whether reads should be made for queries for time ranges that the local storage should have complete data for.","type":"boolean"},"remoteTimeout":{"description":"Timeout for requests to the remote read endpoint.","type":"string"},"requiredMatchers":{"description":"An optional list of equality matchers which have to be present in a selector to query the remote read endpoint.","type":"object"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"}},"required":["url"]},"type":"array"},"remoteWrite":{"description":"If specified, the remote_write spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteWriteSpec defines the remote_write configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or its key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote write.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote write.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"remoteTimeout":{"description":"Timeout for requests to the remote write endpoint.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"},"writeRelabelConfigs":{"description":"The list of remote write relabel configurations.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. 
Default is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. Default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"}},"required":["url"]},"type":"array"},"replicas":{"description":"Number of instances to deploy for a Prometheus deployment.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"retention":{"description":"Time duration Prometheus shall retain data for.","type":"string"},"routePrefix":{"description":"The route prefix Prometheus registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"ruleNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"ruleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"scrapeInterval":{"description":"Interval between consecutive scrapes.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. The Secrets are mounted into /etc/prometheus/secrets/\u003csecret-name\u003e. Secrets changes after initial creation of a Prometheus object are not reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated with the new list of secrets.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. 
Some fields are also present in container.securityContext. Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"serviceMonitorNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"serviceMonitorSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. 
If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. 
If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version of Prometheus to be deployed.","type":"string"}}},"status":{"description":"Most recent observed status of the Prometheus cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Prometheus deployment.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Prometheus deployment.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 25eaa869..f4d73c22 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1662,48 +1662,6 @@ spec: Populated by 
the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids type: string - prometheusRuleSelector: - description: A label selector is a label query over a set of resources. - The result of matchLabels and matchExpressions are ANDed. An empty - label selector matches all objects. A null label selector matches - no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. - The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains - values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies - to. - type: string - operator: - description: operator represents a key's relationship to a - set of values. Valid operators are In, NotIn, Exists and - DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator - is In or NotIn, the values array must be non-empty. If the - operator is Exists or DoesNotExist, the values array must - be empty. This array is replaced during a strategic merge - patch. - items: - type: string - type: array - required: - - key - - operator - type: array - matchLabels: - description: matchLabels is a map of {key,value} pairs. A single - {key,value} in the matchLabels map is equivalent to an element - of matchExpressions, whose key field is "key", the operator is - "In", and the values array contains only "value". The requirements - are ANDed. - type: object remoteRead: description: If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way. 
From c13b0556ba374e49fdbcc523211d285aa26ca394 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 5 Jun 2018 11:39:38 +0200 Subject: [PATCH 285/638] *: Bump version to v0.20.0 --- .../prometheus-operator/prometheus-operator.libsonnet | 2 +- manifests/0prometheus-operator-deployment.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index 6e8d4b5d..4456748e 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheusOperator: 'v0.19.0', + prometheusOperator: 'v0.20.0', configmapReloader: 'v0.0.1', }, diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index e85bbe1f..3276d198 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -19,8 +19,8 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.19.0 - image: quay.io/coreos/prometheus-operator:v0.19.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.20.0 + image: quay.io/coreos/prometheus-operator:v0.20.0 name: prometheus-operator ports: - containerPort: 8080 From 64c73eafe55d23f155a5a64749d126d870f9954f Mon Sep 17 00:00:00 2001 From: Aleksandar Topuzovic Date: Tue, 5 Jun 2018 14:59:40 +0100 Subject: [PATCH 286/638] Ignore '/etc/*' mountpoints * Ignores '/etc/resolv.conf', '/etc/hosts' and '/etc/hostname' mountpoints * Otherwise 3 additional alerts are generated --- jsonnet/kube-prometheus/alerts/node.libsonnet | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet index f5387a99..46a5e36d 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -11,7 +11,7 @@ summary: 'Node disk is running full within 24 hours', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[6h], 3600 * 24) < 0 + predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 ||| % $._config, 'for': '30m', labels: { @@ -25,7 +25,7 @@ summary: 'Node disk is running full within 2 hours', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[30m], 3600 * 2) < 0 + predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 ||| % $._config, 'for': '10m', labels: { From acb7ce62bf20f9e8f1839c28ec9e79b69028346c Mon Sep 17 00:00:00 2001 From: Aleksandar Topuzovic Date: Tue, 5 Jun 2018 15:05:41 +0100 Subject: [PATCH 287/638] Commit generated files --- manifests/prometheus-rules.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index b885eb55..cca1c735 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -679,7 +679,7 @@ spec: full within the next 24 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 24 hours expr: | - predict_linear(node_filesystem_free{job="node-exporter"}[6h], 3600 * 24) < 0 + predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 for: 30m labels: severity: warning @@ -689,7 +689,7 @@ spec: full within the next 2 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 2 hours expr: | - 
predict_linear(node_filesystem_free{job="node-exporter"}[30m], 3600 * 2) < 0 + predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 for: 10m labels: severity: critical From 1d5623d4a0f2548284a0d2bf04ab1fd3b26ac3ef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 6 Jun 2018 11:19:17 +0200 Subject: [PATCH 288/638] kube-prometheus: Extract prometheus-operator jsonnet out of contrib --- Makefile | 8 +- jsonnet/kube-prometheus/jsonnetfile.json | 10 + .../alertmanager-crd.libsonnet | 1 - .../prometheus-crd.libsonnet | 1 - .../prometheus-operator.libsonnet | 178 ------------------ .../prometheusrule-crd.libsonnet | 1 - .../servicemonitor-crd.libsonnet | 1 - .../prometheus/prometheus.libsonnet | 1 + manifests/grafana-dashboardDefinitions.yaml | 30 +++ .../prometheus-serviceMonitorKubelet.yaml | 1 + 10 files changed, 43 insertions(+), 189 deletions(-) delete mode 100644 jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet delete mode 100644 jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet delete mode 100644 jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet delete mode 100644 jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet delete mode 100644 jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet diff --git a/Makefile b/Makefile index 5b76ffc9..fc1e7973 100644 --- a/Makefile +++ b/Makefile @@ -7,13 +7,7 @@ generate: image @echo ">> Compiling assets and generating Kubernetes manifests" docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make generate-raw -crdtojsonnet: - cat ../../example/prometheus-operator-crd/alertmanager.crd.yaml | gojsontoyaml -yamltojson > 
jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet - cat ../../example/prometheus-operator-crd/prometheus.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet - cat ../../example/prometheus-operator-crd/servicemonitor.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet - cat ../../example/prometheus-operator-crd/prometheusrule.crd.yaml | gojsontoyaml -yamltojson > jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet - -generate-raw: crdtojsonnet fmt +generate-raw: fmt jb install ./build.sh diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index f9014406..a845c132 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -29,6 +29,16 @@ } }, "version": "master" + }, + { + "name": "prometheus-operator", + "source": { + "git": { + "remote": "https://github.com/coreos/prometheus-operator", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "extract-po-jsonnet" } ] } \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet deleted file mode 100644 index 4a422463..00000000 --- a/jsonnet/kube-prometheus/prometheus-operator/alertmanager-crd.libsonnet +++ /dev/null @@ -1 +0,0 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"alertmanagers.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Alertmanager","plural":"alertmanagers"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Alertmanager describes an Alertmanager cluster.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. 
Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. 
A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. 
The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"baseImage":{"description":"Base image that is used to deploy pods, without tag.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to an Alertmanager pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. 
The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. 
Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. 
For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. 
Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. 
Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. 
Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"externalUrl":{"description":"The external URL the Alertmanager instances will be available under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Alertmanager server listen on loopback, so that it does not bind against the Pod IP. 
Note this is only for the Alertmanager UI, not the gossip communication.","type":"boolean"},"logLevel":{"description":"Log level for Alertmanager to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"If set to true all actions on the underlaying managed objects are not goint to be performed, except for delete actions.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"replicas":{"description":"Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the running cluster equal to the expected size.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"routePrefix":{"description":"The route prefix Alertmanager registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods. The Secrets are mounted into /etc/alertmanager/secrets/\u003csecret-name\u003e.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. Some fields are also present in container.securityContext. 
Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. 
Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. 
Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version the cluster should be on.","type":"string"}}},"status":{"description":"Most recent observed status of the Alertmanager cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. 
Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Alertmanager cluster.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Alertmanager cluster that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet deleted file mode 100644 index 8b16da2b..00000000 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-crd.libsonnet +++ /dev/null @@ -1 +0,0 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheuses.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"Prometheus","plural":"prometheuses"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"Prometheus defines a Prometheus deployment.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. 
In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"Specification of the desired behavior of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"additionalAlertManagerConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"additionalScrapeConfigs":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"affinity":{"description":"Affinity is a group of affinity scheduling rules.","properties":{"nodeAffinity":{"description":"Node affinity is a group of node affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred.","items":{"description":"An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).","properties":{"preference":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"weight":{"description":"Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","preference"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"A node selector represents the union of the results of one or more label queries over a set of nodes; that is, it represents the OR of the selectors represented by the node selector terms.","properties":{"nodeSelectorTerms":{"description":"Required. A list of node selector terms. The terms are ORed.","items":{"description":"A null or empty node selector term matches no objects.","properties":{"matchExpressions":{"description":"Required. A list of node selector requirements. The requirements are ANDed.","items":{"description":"A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"The label key that the selector applies to.","type":"string"},"operator":{"description":"Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.","type":"string"},"values":{"description":"An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"}},"required":["matchExpressions"]},"type":"array"}},"required":["nodeSelectorTerms"]}}},"podAffinity":{"description":"Pod affinity is a group of inter pod affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. 
The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. 
Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. 
Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}},"podAntiAffinity":{"description":"Pod anti affinity is a group of inter pod anti affinity scheduling rules.","properties":{"preferredDuringSchedulingIgnoredDuringExecution":{"description":"The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding \"weight\" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred.","items":{"description":"The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s)","properties":{"podAffinityTerm":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"weight":{"description":"weight associated with matching the corresponding podAffinityTerm, in the range 1-100.","format":"int32","type":"integer"}},"required":["weight","podAffinityTerm"]},"type":"array"},"requiredDuringSchedulingIgnoredDuringExecution":{"description":"If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. 
all terms must be satisfied.","items":{"description":"Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key \u003ctopologyKey\u003e matches that of any node on which a pod of the set of pods is running","properties":{"labelSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"namespaces":{"description":"namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means \"this pod's namespace\"","items":{"type":"string"},"type":"array"},"topologyKey":{"description":"This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed.","type":"string"}},"required":["topologyKey"]},"type":"array"}}}}},"alerting":{"description":"AlertingSpec defines parameters for alerting configuration of Prometheus servers.","properties":{"alertmanagers":{"description":"AlertmanagerEndpoints Prometheus should fire alerts against.","items":{"description":"AlertmanagerEndpoints defines a selection of a single Endpoints object containing alertmanager IPs to fire alerts against.","properties":{"bearerTokenFile":{"description":"BearerTokenFile to read from filesystem to use when authenticating to Alertmanager.","type":"string"},"name":{"description":"Name of Endpoints object in Namespace.","type":"string"},"namespace":{"description":"Namespace of Endpoints object.","type":"string"},"pathPrefix":{"description":"Prefix for the HTTP path alerts are pushed to.","type":"string"},"port":{},"scheme":{"description":"Scheme to use when firing alerts.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the 
targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}},"required":["namespace","name","port"]},"type":"array"}},"required":["alertmanagers"]},"baseImage":{"description":"Base image to use for a Prometheus deployment.","type":"string"},"containers":{"description":"Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.","items":{"description":"A single application container that you want to run within a pod.","properties":{"args":{"description":"Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"command":{"description":"Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell","items":{"type":"string"},"type":"array"},"env":{"description":"List of environment variables to set in the container. 
Cannot be updated.","items":{"description":"EnvVar represents an environment variable present in a Container.","properties":{"name":{"description":"Name of the environment variable. Must be a C_IDENTIFIER.","type":"string"},"value":{"description":"Variable references $(VAR_NAME) are expanded using the previous defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. The $(VAR_NAME) syntax can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".","type":"string"},"valueFrom":{"description":"EnvVarSource represents a source for the value of an EnvVar.","properties":{"configMapKeyRef":{"description":"Selects a key from a ConfigMap.","properties":{"key":{"description":"The key to select.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap or it's key must be defined","type":"boolean"}},"required":["key"]},"fieldRef":{"description":"ObjectFieldSelector selects an APIVersioned field of an object.","properties":{"apiVersion":{"description":"Version of the schema the FieldPath is written in terms of, defaults to \"v1\".","type":"string"},"fieldPath":{"description":"Path of the field to select in the specified API version.","type":"string"}},"required":["fieldPath"]},"resourceFieldRef":{"description":"ResourceFieldSelector represents container resources (cpu, memory) and their output format","properties":{"containerName":{"description":"Container name: required for volumes, optional for env vars","type":"string"},"divisor":{},"resource":{"description":"Required: resource to select","type":"string"}},"required":["resource"]},"secretKeyRef":{"description":"SecretKeySelector selects a 
key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}}},"required":["name"]},"type":"array"},"envFrom":{"description":"List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated.","items":{"description":"EnvFromSource represents the source of a set of ConfigMaps","properties":{"configMapRef":{"description":"ConfigMapEnvSource selects a ConfigMap to populate the environment variables with.\n\nThe contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the ConfigMap must be defined","type":"boolean"}}},"prefix":{"description":"An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER.","type":"string"},"secretRef":{"description":"SecretEnvSource selects a Secret to populate the environment variables with.\n\nThe contents of the target Secret's Data field will represent the key-value pairs as environment variables.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret must be defined","type":"boolean"}}}}},"type":"array"},"image":{"description":"Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.","type":"string"},"imagePullPolicy":{"description":"Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images","type":"string"},"lifecycle":{"description":"Lifecycle describes actions that the management system should take in response to container lifecycle events. For the PostStart and PreStop lifecycle handlers, management of the container blocks until the action is complete, unless the container process fails, in which case the handler is aborted.","properties":{"postStart":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}},"preStop":{"description":"Handler defines a specific action that should be taken","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. 
HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]}}}}},"livenessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. 
You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"name":{"description":"Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). 
Cannot be updated.","type":"string"},"ports":{"description":"List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default \"0.0.0.0\" address inside a container will be accessible from the network. Cannot be updated.","items":{"description":"ContainerPort represents a network port in a single container.","properties":{"containerPort":{"description":"Number of port to expose on the pod's IP address. This must be a valid port number, 0 \u003c x \u003c 65536.","format":"int32","type":"integer"},"hostIP":{"description":"What host IP to bind the external port to.","type":"string"},"hostPort":{"description":"Number of port to expose on the host. If specified, this must be a valid port number, 0 \u003c x \u003c 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this.","format":"int32","type":"integer"},"name":{"description":"If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services.","type":"string"},"protocol":{"description":"Protocol for port. Must be UDP or TCP. Defaults to \"TCP\".","type":"string"}},"required":["containerPort"]},"type":"array"},"readinessProbe":{"description":"Probe describes a health check to be performed against a container to determine whether it is alive or ready to receive traffic.","properties":{"exec":{"description":"ExecAction describes a \"run in container\" action.","properties":{"command":{"description":"Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy.","items":{"type":"string"},"type":"array"}}},"failureThreshold":{"description":"Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1.","format":"int32","type":"integer"},"httpGet":{"description":"HTTPGetAction describes an action based on HTTP Get requests.","properties":{"host":{"description":"Host name to connect to, defaults to the pod IP. You probably want to set \"Host\" in httpHeaders instead.","type":"string"},"httpHeaders":{"description":"Custom headers to set in the request. HTTP allows repeated headers.","items":{"description":"HTTPHeader describes a custom header to be used in HTTP probes","properties":{"name":{"description":"The header field name","type":"string"},"value":{"description":"The header field value","type":"string"}},"required":["name","value"]},"type":"array"},"path":{"description":"Path to access on the HTTP server.","type":"string"},"port":{},"scheme":{"description":"Scheme to use for connecting to the host. Defaults to HTTP.","type":"string"}},"required":["port"]},"initialDelaySeconds":{"description":"Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"},"periodSeconds":{"description":"How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1.","format":"int32","type":"integer"},"successThreshold":{"description":"Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness. 
Minimum value is 1.","format":"int32","type":"integer"},"tcpSocket":{"description":"TCPSocketAction describes an action based on opening a socket","properties":{"host":{"description":"Optional: Host name to connect to, defaults to the pod IP.","type":"string"},"port":{}},"required":["port"]},"timeoutSeconds":{"description":"Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes","format":"int32","type":"integer"}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"securityContext":{"description":"SecurityContext holds security configuration that will be applied to a container. Some fields are present in both SecurityContext and PodSecurityContext. When both are set, the values in SecurityContext take precedence.","properties":{"allowPrivilegeEscalation":{"description":"AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN","type":"boolean"},"capabilities":{"description":"Adds and removes POSIX capabilities from running containers.","properties":{"add":{"description":"Added capabilities","items":{"type":"string"},"type":"array"},"drop":{"description":"Removed capabilities","items":{"type":"string"},"type":"array"}}},"privileged":{"description":"Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false.","type":"boolean"},"readOnlyRootFilesystem":{"description":"Whether this container has a read-only root filesystem. Default is false.","type":"boolean"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}}}},"stdin":{"description":"Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false.","type":"boolean"},"stdinOnce":{"description":"Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false","type":"boolean"},"terminationMessagePath":{"description":"Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. 
Cannot be updated.","type":"string"},"terminationMessagePolicy":{"description":"Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated.","type":"string"},"tty":{"description":"Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false.","type":"boolean"},"volumeDevices":{"description":"volumeDevices is the list of block devices to be used by the container. This is an alpha feature and may change in the future.","items":{"description":"volumeDevice describes a mapping of a raw block device within a container.","properties":{"devicePath":{"description":"devicePath is the path inside of the container that the device will be mapped to.","type":"string"},"name":{"description":"name must match the name of a persistentVolumeClaim in the pod","type":"string"}},"required":["name","devicePath"]},"type":"array"},"volumeMounts":{"description":"Pod volumes to mount into the container's filesystem. Cannot be updated.","items":{"description":"VolumeMount describes a mounting of a Volume within a container.","properties":{"mountPath":{"description":"Path within the container at which the volume should be mounted. Must not contain ':'.","type":"string"},"mountPropagation":{"description":"mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationHostToContainer is used. 
This field is beta in 1.10.","type":"string"},"name":{"description":"This must match the Name of a Volume.","type":"string"},"readOnly":{"description":"Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false.","type":"boolean"},"subPath":{"description":"Path within the volume from which the container's volume should be mounted. Defaults to \"\" (volume's root).","type":"string"}},"required":["name","mountPath"]},"type":"array"},"workingDir":{"description":"Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated.","type":"string"}},"required":["name"]},"type":"array"},"evaluationInterval":{"description":"Interval between consecutive evaluations.","type":"string"},"externalLabels":{"description":"The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager).","type":"object"},"externalUrl":{"description":"The external URL the Prometheus instances will be available under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name.","type":"string"},"imagePullSecrets":{"description":"An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod","items":{"description":"LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace.","properties":{"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"}}},"type":"array"},"listenLocal":{"description":"ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.","type":"boolean"},"logLevel":{"description":"Log level for Prometheus to be configured with.","type":"string"},"nodeSelector":{"description":"Define which Nodes the Pods are scheduled on.","type":"object"},"paused":{"description":"When a Prometheus deployment is paused, no actions except for deletion will be performed on the underlying objects.","type":"boolean"},"podMetadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. 
If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. 
When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. 
Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within which each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended on by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and pass them unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and passed unmodified back to the server. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"remoteRead":{"description":"If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteReadSpec defines the remote_read configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote read.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote read.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"readRecent":{"description":"Whether reads should be made for queries for time ranges that the local storage should have complete data for.","type":"boolean"},"remoteTimeout":{"description":"Timeout for requests to the remote read endpoint.","type":"string"},"requiredMatchers":{"description":"An optional list of equality matchers which have to be present in a selector to query the remote read endpoint.","type":"object"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"}},"required":["url"]},"type":"array"},"remoteWrite":{"description":"If specified, the remote_write spec. 
This is an experimental feature, it may change in any upcoming release in a breaking way.","items":{"description":"RemoteWriteSpec defines the remote_write configuration for prometheus.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerToken":{"description":"bearer token for remote write.","type":"string"},"bearerTokenFile":{"description":"File to read bearer token for remote write.","type":"string"},"proxyUrl":{"description":"Optional ProxyURL","type":"string"},"remoteTimeout":{"description":"Timeout for requests to the remote write endpoint.","type":"string"},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}},"url":{"description":"The URL of the endpoint to send samples to.","type":"string"},"writeRelabelConfigs":{"description":"The list of remote write relabel configurations.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. 
default is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"}},"required":["url"]},"type":"array"},"replicas":{"description":"Number of instances to deploy for a Prometheus deployment.","format":"int32","type":"integer"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"retention":{"description":"Time duration Prometheus shall retain data for.","type":"string"},"routePrefix":{"description":"The route prefix Prometheus registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`.","type":"string"},"ruleNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"ruleSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"scrapeInterval":{"description":"Interval between consecutive scrapes.","type":"string"},"secrets":{"description":"Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. The Secrets are mounted into /etc/prometheus/secrets/\u003csecret-name\u003e. Secrets changes after initial creation of a Prometheus object are not reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated with the new list of secrets.","items":{"type":"string"},"type":"array"},"securityContext":{"description":"PodSecurityContext holds pod-level security attributes and common container settings. 
Some fields are also present in container.securityContext. Field values of container.securityContext take precedence over field values of PodSecurityContext.","properties":{"fsGroup":{"description":"A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod:\n\n1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.","format":"int64","type":"integer"},"runAsGroup":{"description":"The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"runAsNonRoot":{"description":"Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.","type":"boolean"},"runAsUser":{"description":"The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container.","format":"int64","type":"integer"},"seLinuxOptions":{"description":"SELinuxOptions are the labels to be applied to the container","properties":{"level":{"description":"Level is SELinux level label that applies to the container.","type":"string"},"role":{"description":"Role is a SELinux role label that applies to the container.","type":"string"},"type":{"description":"Type is a SELinux type label that applies to the container.","type":"string"},"user":{"description":"User is a SELinux user label that applies to the container.","type":"string"}}},"supplementalGroups":{"description":"A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container.","items":{"format":"int64","type":"integer"},"type":"array"}}},"serviceAccountName":{"description":"ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods.","type":"string"},"serviceMonitorNamespaceSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"serviceMonitorSelector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storage":{"description":"StorageSpec defines the configured storage for a group Prometheus servers.","properties":{"class":{"description":"Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses DEPRECATED","type":"string"},"emptyDir":{"description":"Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling.","properties":{"medium":{"description":"What type of storage medium should back this directory. The default is \"\" which means to use the node's default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir","type":"string"},"sizeLimit":{}}},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. 
A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"volumeClaimTemplate":{"description":"PersistentVolumeClaim is a user's request for and claim to a persistent volume","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. 
The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. 
On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. 
The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. 
An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes","properties":{"accessModes":{"description":"AccessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"resources":{"description":"ResourceRequirements describes the compute resource requirements.","properties":{"limits":{"description":"Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"},"requests":{"description":"Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/","type":"object"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". The requirements are ANDed.","type":"object"}}},"storageClassName":{"description":"Name of the StorageClass required by the claim. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1","type":"string"},"volumeMode":{"description":"volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. This is an alpha feature and may change in the future.","type":"string"},"volumeName":{"description":"VolumeName is the binding reference to the PersistentVolume backing this claim.","type":"string"}}},"status":{"description":"PersistentVolumeClaimStatus is the current status of a persistent volume claim.","properties":{"accessModes":{"description":"AccessModes contains the actual access modes the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1","items":{"type":"string"},"type":"array"},"capacity":{"description":"Represents the actual resources of the underlying volume.","type":"object"},"conditions":{"description":"Current Condition of persistent volume claim. If underlying persistent volume is being resized then the Condition will be set to 'ResizeStarted'.","items":{"description":"PersistentVolumeClaimCondition contails details about state of pvc","properties":{"lastProbeTime":{"format":"date-time","type":"string"},"lastTransitionTime":{"format":"date-time","type":"string"},"message":{"description":"Human-readable message indicating details about last transition.","type":"string"},"reason":{"description":"Unique, this should be a short, machine understandable string that gives the reason for condition's last transition. 
If it reports \"ResizeStarted\" that means the underlying persistent volume is being resized.","type":"string"},"status":{"type":"string"},"type":{"type":"string"}},"required":["type","status"]},"type":"array"},"phase":{"description":"Phase represents the current phase of PersistentVolumeClaim.","type":"string"}}}}}}},"tolerations":{"description":"If specified, the pod's tolerations.","items":{"description":"The pod this Toleration is attached to tolerates any taint that matches the triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.","properties":{"effect":{"description":"Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.","type":"string"},"key":{"description":"Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.","type":"string"},"operator":{"description":"Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.","type":"string"},"tolerationSeconds":{"description":"TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.","format":"int64","type":"integer"},"value":{"description":"Value is the taint value the toleration matches to. 
If the operator is Exists, the value should be empty, otherwise just a regular string.","type":"string"}}},"type":"array"},"version":{"description":"Version of Prometheus to be deployed.","type":"string"}}},"status":{"description":"Most recent observed status of the Prometheus cluster. Read-only. Not included when requesting from the apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status","properties":{"availableReplicas":{"description":"Total number of available pods (ready for at least minReadySeconds) targeted by this Prometheus deployment.","format":"int32","type":"integer"},"paused":{"description":"Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed.","type":"boolean"},"replicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector).","format":"int32","type":"integer"},"unavailableReplicas":{"description":"Total number of unavailable pods targeted by this Prometheus deployment.","format":"int32","type":"integer"},"updatedReplicas":{"description":"Total number of non-terminated pods targeted by this Prometheus deployment that have the desired version spec.","format":"int32","type":"integer"}},"required":["paused","replicas","updatedReplicas","availableReplicas","unavailableReplicas"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet deleted file mode 100644 index 4456748e..00000000 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ /dev/null @@ -1,178 +0,0 @@ -local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; - -{ - 
_config+:: { - namespace: 'default', - - versions+:: { - prometheusOperator: 'v0.20.0', - configmapReloader: 'v0.0.1', - }, - - imageRepos+:: { - prometheusOperator: 'quay.io/coreos/prometheus-operator', - configmapReloader: 'quay.io/coreos/configmap-reload', - prometheusConfigReloader: 'quay.io/coreos/prometheus-config-reloader', - }, - }, - - prometheusOperator+:: { - // Prefixing with 0 to ensure these manifests are listed and therefore created first. - '0alertmanagerCustomResourceDefinition': import 'alertmanager-crd.libsonnet', - '0prometheusCustomResourceDefinition': import 'prometheus-crd.libsonnet', - '0servicemonitorCustomResourceDefinition': import 'servicemonitor-crd.libsonnet', - '0prometheusruleCustomResourceDefinition': import 'prometheusrule-crd.libsonnet', - - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('prometheus-operator') + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('prometheus-operator') + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-operator', namespace: $._config.namespace }]), - - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local apiExtensionsRule = policyRule.new() + - policyRule.withApiGroups(['apiextensions.k8s.io']) + - policyRule.withResources([ - 'customresourcedefinitions', - ]) + - policyRule.withVerbs(['*']); - - local monitoringRule = policyRule.new() + - policyRule.withApiGroups(['monitoring.coreos.com']) + - policyRule.withResources([ - 'alertmanagers', - 'prometheuses', - 'prometheuses/finalizers', - 'alertmanagers/finalizers', - 'servicemonitors', - 'prometheusrules', - ]) + - policyRule.withVerbs(['*']); - - local appsRule = policyRule.new() + - 
policyRule.withApiGroups(['apps']) + - policyRule.withResources([ - 'statefulsets', - ]) + - policyRule.withVerbs(['*']); - - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'configmaps', - 'secrets', - ]) + - policyRule.withVerbs(['*']); - - local podRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'pods', - ]) + - policyRule.withVerbs(['list', 'delete']); - - local routingRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'services', - 'endpoints', - ]) + - policyRule.withVerbs(['get', 'create', 'update']); - - local nodeRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'nodes', - ]) + - policyRule.withVerbs(['list', 'watch']); - - local namespaceRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'namespaces', - ]) + - policyRule.withVerbs(['list', 'watch']); - - local rules = [apiExtensionsRule, monitoringRule, appsRule, coreRule, podRule, routingRule, nodeRule, namespaceRule]; - - clusterRole.new() + - clusterRole.mixin.metadata.withName('prometheus-operator') + - clusterRole.withRules(rules), - - deployment: - local deployment = k.apps.v1beta2.deployment; - local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; - local containerPort = container.portsType; - - local targetPort = 8080; - local podLabels = { 'k8s-app': 'prometheus-operator' }; - - local operatorContainer = - container.new('prometheus-operator', $._config.imageRepos.prometheusOperator + ':' + $._config.versions.prometheusOperator) + - container.withPorts(containerPort.newNamed('http', targetPort)) + - container.withArgs([ - '--kubelet-service=kube-system/kubelet', - '--config-reloader-image=' + $._config.imageRepos.configmapReloader + ':' + $._config.versions.configmapReloader, - '--prometheus-config-reloader=' + 
$._config.imageRepos.prometheusConfigReloader + ':' + $._config.versions.prometheusOperator, - ]) + - container.mixin.resources.withRequests({ cpu: '100m', memory: '50Mi' }) + - container.mixin.resources.withLimits({ cpu: '200m', memory: '100Mi' }); - - deployment.new('prometheus-operator', 1, operatorContainer, podLabels) + - deployment.mixin.metadata.withNamespace($._config.namespace) + - deployment.mixin.metadata.withLabels(podLabels) + - deployment.mixin.spec.selector.withMatchLabels(podLabels) + - deployment.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + - deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + - deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - deployment.mixin.spec.template.spec.withServiceAccountName('prometheus-operator'), - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new('prometheus-operator') + - serviceAccount.mixin.metadata.withNamespace($._config.namespace), - - service: - local service = k.core.v1.service; - local servicePort = k.core.v1.service.mixin.spec.portsType; - - local poServicePort = servicePort.newNamed('http', 8080, 'http'); - - service.new('prometheus-operator', $.prometheusOperator.deployment.spec.selector.matchLabels, [poServicePort]) + - service.mixin.metadata.withLabels({ 'k8s-app': 'prometheus-operator' }) + - service.mixin.metadata.withNamespace($._config.namespace) + - service.mixin.spec.withClusterIp('None'), - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'prometheus-operator', - namespace: $._config.namespace, - labels: { - 'k8s-app': 'prometheus-operator', - }, - }, - spec: { - endpoints: [ - { - port: 'http', - }, - ], - selector: { - matchLabels: { - 'k8s-app': 'prometheus-operator', - }, - }, - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet 
b/jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet deleted file mode 100644 index 969e8109..00000000 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheusrule-crd.libsonnet +++ /dev/null @@ -1 +0,0 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"prometheusrules.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"PrometheusRule","plural":"prometheusrules"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"PrometheusRule defines alerting rules for a Prometheus instance","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"metadata":{"description":"ObjectMeta is metadata that all persisted resources must have, which includes all objects users must create.","properties":{"annotations":{"description":"Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations","type":"object"},"clusterName":{"description":"The name of the cluster which the object belongs to. This is used to distinguish resources with same name and namespace in different clusters. 
This field is not set anywhere right now and apiserver is going to ignore it if set in create or update request.","type":"string"},"creationTimestamp":{"format":"date-time","type":"string"},"deletionGracePeriodSeconds":{"description":"Number of seconds allowed for this object to gracefully terminate before it will be removed from the system. Only set when deletionTimestamp is also set. May only be shortened. Read-only.","format":"int64","type":"integer"},"deletionTimestamp":{"format":"date-time","type":"string"},"finalizers":{"description":"Must be empty before the object is deleted from the registry. Each entry is an identifier for the responsible component that will remove the entry from the list. If the deletionTimestamp of the object is non-nil, entries in this list can only be removed.","items":{"type":"string"},"type":"array"},"generateName":{"description":"GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.\n\nIf this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).\n\nApplied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency","type":"string"},"generation":{"description":"A sequence number representing a specific generation of the desired state. Populated by the system. 
Read-only.","format":"int64","type":"integer"},"initializers":{"description":"Initializers tracks the progress of initialization.","properties":{"pending":{"description":"Pending is a list of initializers that must execute in order before this object is visible. When the last pending initializer is removed, and no failing result is set, the initializers struct will be set to nil and the object is considered as initialized and visible to all clients.","items":{"description":"Initializer is information about an initializer that has not yet completed.","properties":{"name":{"description":"name of the process that is responsible for initializing this object.","type":"string"}},"required":["name"]},"type":"array"},"result":{"description":"Status is a return value for calls that don't return other objects.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"code":{"description":"Suggested HTTP return code for this status, 0 if not set.","format":"int32","type":"integer"},"details":{"description":"StatusDetails is a set of additional properties that MAY be set by the server to provide additional information about a response. The Reason field of a Status object defines what attributes will be set. Clients must ignore fields that do not match the defined type of each attribute, and should assume that any attribute may be empty, invalid, or under defined.","properties":{"causes":{"description":"The Causes array includes more details associated with the StatusReason failure. 
Not all StatusReasons may provide detailed causes.","items":{"description":"StatusCause provides more information about an api.Status failure, including cases when multiple errors are encountered.","properties":{"field":{"description":"The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional.\n\nExamples:\n \"name\" - the field \"name\" on the current resource\n \"items[0].name\" - the field \"name\" on the first array entry in \"items\"","type":"string"},"message":{"description":"A human-readable description of the cause of the error. This field may be presented as-is to a reader.","type":"string"},"reason":{"description":"A machine-readable description of the cause of the error. If this value is empty there is no information available.","type":"string"}}},"type":"array"},"group":{"description":"The group attribute of the resource associated with the status StatusReason.","type":"string"},"kind":{"description":"The kind attribute of the resource associated with the status StatusReason. On some operations may differ from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"The name attribute of the resource associated with the status StatusReason (when there is a single name which can be described).","type":"string"},"retryAfterSeconds":{"description":"If specified, the time in seconds before the operation should be retried. Some errors may indicate the client must take an alternate action - for those errors this field may indicate how long to wait before taking the alternate action.","format":"int32","type":"integer"},"uid":{"description":"UID of the resource. (when there is a single resource which can be described). 
More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"message":{"description":"A human-readable description of the status of this operation.","type":"string"},"metadata":{"description":"ListMeta describes metadata that synthetic resources must have, including lists and various status objects. A resource may have only one of {ObjectMeta, ListMeta}.","properties":{"continue":{"description":"continue may be set if the user set a limit on the number of items returned, and indicates that the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available objects. Continuing a list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical to the value in the first response.","type":"string"},"resourceVersion":{"description":"String that identifies the server's internal version of this object that can be used by clients to determine when objects have changed. Value must be treated as opaque by clients and passed unmodified back to the server. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"selfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"}}},"reason":{"description":"A machine-readable description of why this operation is in the \"Failure\" status. If this value is empty there is no information available. 
A Reason clarifies an HTTP status code but does not override it.","type":"string"},"status":{"description":"Status of the operation. One of: \"Success\" or \"Failure\". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status","type":"string"}}}},"required":["pending"]},"labels":{"description":"Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels","type":"object"},"name":{"description":"Name must be unique within a namespace. Is required when creating resources, although some resources may allow a client to request the generation of an appropriate name automatically. Name is primarily intended for creation idempotence and configuration definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"namespace":{"description":"Namespace defines the space within each name must be unique. An empty namespace is equivalent to the \"default\" namespace, but \"default\" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.\n\nMust be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces","type":"string"},"ownerReferences":{"description":"List of objects depended by this object. If ALL objects in the list have been deleted, this object will be garbage collected. If this object is managed by a controller, then an entry in this list will point to this controller, with the controller field set to true. There cannot be more than one managing controller.","items":{"description":"OwnerReference contains enough information to let you identify an owning object. 
Currently, an owning object must be in the same namespace, so there is no namespace field.","properties":{"apiVersion":{"description":"API version of the referent.","type":"string"},"blockOwnerDeletion":{"description":"If true, AND if the owner has the \"foregroundDeletion\" finalizer, then the owner cannot be deleted from the key-value store until this reference is removed. Defaults to false. To set this field, a user needs \"delete\" permission of the owner, otherwise 422 (Unprocessable Entity) will be returned.","type":"boolean"},"controller":{"description":"If true, this reference points to the managing controller.","type":"boolean"},"kind":{"description":"Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"name":{"description":"Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names","type":"string"},"uid":{"description":"UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}},"required":["apiVersion","kind","name","uid"]},"type":"array"},"resourceVersion":{"description":"An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.\n\nPopulated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency","type":"string"},"selfLink":{"description":"SelfLink is a URL representing this object. Populated by the system. Read-only.","type":"string"},"uid":{"description":"UID is the unique in time and space value for this object. 
It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.\n\nPopulated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids","type":"string"}}},"spec":{"description":"PrometheusRuleSpec contains specification parameters for a Rule.","properties":{"groups":{"description":"Content of Prometheus rule file","items":{"description":"RuleGroup is a list of sequentially evaluated recording and alerting rules.","properties":{"interval":{"type":"string"},"name":{"type":"string"},"rules":{"items":{"description":"Rule describes an alerting or recording rule.","properties":{"alert":{"type":"string"},"annotations":{"type":"object"},"expr":{"type":"string"},"for":{"type":"string"},"labels":{"type":"object"},"record":{"type":"string"}},"required":["expr"]},"type":"array"}},"required":["name","rules"]},"type":"array"}}}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet deleted file mode 100644 index 94a82be0..00000000 --- a/jsonnet/kube-prometheus/prometheus-operator/servicemonitor-crd.libsonnet +++ /dev/null @@ -1 +0,0 @@ -{"apiVersion":"apiextensions.k8s.io/v1beta1","kind":"CustomResourceDefinition","metadata":{"creationTimestamp":null,"name":"servicemonitors.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"ServiceMonitor","plural":"servicemonitors"},"scope":"Namespaced","validation":{"openAPIV3Schema":{"description":"ServiceMonitor defines monitoring for a set of services.","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds","type":"string"},"spec":{"description":"ServiceMonitorSpec contains specification parameters for a ServiceMonitor.","properties":{"endpoints":{"description":"A list of endpoints allowed as part of this ServiceMonitor.","items":{"description":"Endpoint defines a scrapeable endpoint serving Prometheus metrics.","properties":{"basicAuth":{"description":"BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints","properties":{"password":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]},"username":{"description":"SecretKeySelector selects a key of a Secret.","properties":{"key":{"description":"The key of the secret to select from. Must be a valid secret key.","type":"string"},"name":{"description":"Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names","type":"string"},"optional":{"description":"Specify whether the Secret or it's key must be defined","type":"boolean"}},"required":["key"]}}},"bearerTokenFile":{"description":"File to read bearer token for scraping targets.","type":"string"},"honorLabels":{"description":"HonorLabels chooses the metric's labels on collisions with target labels.","type":"boolean"},"interval":{"description":"Interval at which metrics should be scraped","type":"string"},"metricRelabelings":{"description":"MetricRelabelConfigs to apply to samples before ingestion.","items":{"description":"RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `\u003cmetric_relabel_configs\u003e`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs","properties":{"action":{"description":"Action to perform based on regex matching. Default is 'replace'","type":"string"},"modulus":{"description":"Modulus to take of the hash of the source label values.","format":"int64","type":"integer"},"regex":{"description":"Regular expression against which the extracted value is matched. defailt is '(.*)'","type":"string"},"replacement":{"description":"Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'","type":"string"},"separator":{"description":"Separator placed between concatenated source label values. default is ';'.","type":"string"},"sourceLabels":{"description":"The source labels select values from existing labels. 
Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.","items":{"type":"string"},"type":"array"},"targetLabel":{"description":"Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.","type":"string"}}},"type":"array"},"params":{"description":"Optional HTTP URL parameters","type":"object"},"path":{"description":"HTTP path to scrape for metrics.","type":"string"},"port":{"description":"Name of the service port this endpoint refers to. Mutually exclusive with targetPort.","type":"string"},"proxyUrl":{"description":"ProxyURL eg http://proxyserver:2195 Directs scrapes to proxy through this endpoint.","type":"string"},"scheme":{"description":"HTTP scheme to use for scraping.","type":"string"},"scrapeTimeout":{"description":"Timeout after which the scrape is ended","type":"string"},"targetPort":{},"tlsConfig":{"description":"TLSConfig specifies TLS configuration parameters.","properties":{"caFile":{"description":"The CA cert to use for the targets.","type":"string"},"certFile":{"description":"The client cert file for the targets.","type":"string"},"insecureSkipVerify":{"description":"Disable target certificate validation.","type":"boolean"},"keyFile":{"description":"The client key file for the targets.","type":"string"},"serverName":{"description":"Used to verify the hostname for the targets.","type":"string"}}}}},"type":"array"},"jobLabel":{"description":"The label to use to retrieve the job name from.","type":"string"},"namespaceSelector":{"description":"A selector for selecting namespaces either selecting all namespaces or a list of namespaces.","properties":{"any":{"description":"Boolean describing whether all namespaces are selected in contrast to a list restricting them.","type":"boolean"},"matchNames":{"description":"List of namespace 
names.","items":{"type":"string"},"type":"array"}}},"selector":{"description":"A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.","properties":{"matchExpressions":{"description":"matchExpressions is a list of label selector requirements. The requirements are ANDed.","items":{"description":"A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.","properties":{"key":{"description":"key is the label key that the selector applies to.","type":"string"},"operator":{"description":"operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.","type":"string"},"values":{"description":"values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.","items":{"type":"string"},"type":"array"}},"required":["key","operator"]},"type":"array"},"matchLabels":{"description":"matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is \"key\", the operator is \"In\", and the values array contains only \"value\". 
The requirements are ANDed.","type":"object"}}},"targetLabels":{"description":"TargetLabels transfers labels on the Kubernetes Service onto the target.","items":{"type":"string"},"type":"array"}},"required":["endpoints","selector"]}},"required":["spec"]}},"version":"v1"},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":null}} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index c99f2a89..f1758cf7 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -306,6 +306,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; port: 'https-metrics', scheme: 'https', interval: '30s', + honorLabels: true, tlsConfig: { insecureSkipVerify: true, }, diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index f4058562..7f6bc8f5 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -4707,6 +4707,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -4795,6 +4798,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -4911,6 +4917,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -5114,6 +5123,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -5317,6 +5329,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -5405,6 +5420,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -5609,6 +5627,9 @@ data: }, "lines": 
true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -5725,6 +5746,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -5827,6 +5851,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -6668,6 +6695,9 @@ data: }, "lines": true, "linewidth": 1, + "links": [ + + ], "nullPointMode": "null", "percentage": false, "pointradius": 5, diff --git a/manifests/prometheus-serviceMonitorKubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml index 06ec7fc8..97d7f1a1 100644 --- a/manifests/prometheus-serviceMonitorKubelet.yaml +++ b/manifests/prometheus-serviceMonitorKubelet.yaml @@ -8,6 +8,7 @@ metadata: spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true interval: 30s port: https-metrics scheme: https From 8601fb7bea2a754473344293cd8f7a5247e7c3f8 Mon Sep 17 00:00:00 2001 From: Dmitry Mishin Date: Wed, 6 Jun 2018 11:42:24 -0700 Subject: [PATCH 289/638] Added section about grafana config #1436 --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 9256319c..c71d0743 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,19 @@ A hidden `_config` field is located at the top level of the object this library } ``` +The grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but needed configuration can be customized from the same file. F.e. to allow anonymous access to grafana, add the `_config` section: + +``` + grafana+:: { + config: { + sections: { + "auth.anonymous": {enabled: true}, + }, + }, + }, +``` + + ## Customization Jsonnet is a turing complete language, any logic can be reflected in it. 
It also has powerful merge functionalities, allowing sophisticated customizations of any kind simply by merging it into the object the library provides. From 502044ff93380486695523e06bb13093cfa9dc1c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 7 Jun 2018 10:01:23 +0200 Subject: [PATCH 290/638] *: Use all jsonnet from master branch --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index a845c132..a7b0a203 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "extract-po-jsonnet" + "version": "master" } ] } \ No newline at end of file From 9ecfa96875797f3b0450e295d7338d19f69723ce Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 7 Jun 2018 10:35:52 +0200 Subject: [PATCH 291/638] kube-prometheus: Add troubleshooting section --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index c71d0743..0ff7e201 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,18 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) * [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) * [Minikube Example](#minikube-example) +* [Troubleshooting](#troubleshooting) + * [Error retrieving kubelet metrics](#error-retrieving-kubelet-metrics) ## Prerequisites You will need a Kubernetes cluster, that's it! By default it is assumed, that the kubelet uses token authN and authZ, as otherwise Prometheus needs a client certificate, which gives it full access to the kubelet, rather than just the metrics. 
Token authN and authZ allows more fine grained and easier access control. +This means the kubelet configuration must contain these flags: + +* `--authentication-token-webhook=true` This flag enables `ServiceAccount` tokens to be used to authenticate against the kubelet(s). +* `--authorization-mode=Webhook` This flag makes the kubelet perform an RBAC request against the API to determine whether the requesting entity (Prometheus in this case) is allowed to access a resource, specifically, for this project, the `/metrics` endpoint. + ### minikube In order to just try out this stack, start minikube with the following command: @@ -269,3 +276,19 @@ local kp = { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` + +## Troubleshooting + +### Error retrieving kubelet metrics + +Should the Prometheus `/targets` page show kubelet targets but not be able to successfully scrape their metrics, then most likely it is a problem with the authentication and authorization setup of the kubelets. + +As described in the [prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet, token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default. + +#### Authentication problem + +The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations. + +#### Authorization problem + +The Prometheus `/targets` page will show the kubelet job with the error `401 Unauthorized`, when token authorization is not enabled. Ensure that the `--authorization-mode=Webhook` flag is enabled on all kubelet configurations. 
From 62fa823e3ef22062a9a155d4a89b807393550248 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 7 Jun 2018 11:08:12 +0200 Subject: [PATCH 292/638] kube-prometheus: Add docs on how to set Alertmanager configuration --- README.md | 44 +++++++++++++++++++ examples/alertmanager-config-external.jsonnet | 7 +++ examples/alertmanager-config.jsonnet | 22 ++++++++++ examples/alertmanager-config.yaml | 15 +++++++ .../alertmanager/alertmanager.libsonnet | 19 ++++++-- manifests/alertmanager-secret.yaml | 2 +- 6 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 examples/alertmanager-config-external.jsonnet create mode 100644 examples/alertmanager-config.jsonnet create mode 100644 examples/alertmanager-config.yaml diff --git a/README.md b/README.md index 0ff7e201..89686cf4 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [Compiling](#compiling) * [Configuration](#configuration) * [Customization](#customization) + * [Alertmanager configuration](#alertmanager-configuration) * [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) * [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) * [Minikube Example](#minikube-example) @@ -242,6 +243,49 @@ local daemonset = k.apps.v1beta2.daemonSet; }).nodeExporter.daemonset ``` +### Alertmanager configuration + +The Alertmanager configuration is located in the `_config.alertmanager.config` configuration field. In order to set a custom Alertmanager configuration simply set this field. 
+ +[embedmd]:# (examples/alertmanager-config.jsonnet) +```jsonnet +((import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + alertmanager+: { + config: ||| + global: + resolve_timeout: 10m + route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' + receivers: + - name: 'null' + |||, + }, + }, + }).alertmanager.secret +``` + +In the above example the configuration has been inlined, but can just as well be an external file imported in jsonnet via the `importstr` function. + +[embedmd]:# (examples/alertmanager-config-external.jsonnet) +```jsonnet +((import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + alertmanager+: { + config: importstr 'alertmanager-config.yaml', + }, + }, + }).alertmanager.secret +``` + ### Customizing Prometheus alerting/recording rules and Grafana dashboards See [developing Prometheus rules and Grafana dashboards](docs/developing-prometheus-rules-and-grafana-dashboards.md) guide. 
diff --git a/examples/alertmanager-config-external.jsonnet b/examples/alertmanager-config-external.jsonnet new file mode 100644 index 00000000..c2b34cca --- /dev/null +++ b/examples/alertmanager-config-external.jsonnet @@ -0,0 +1,7 @@ +((import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + alertmanager+: { + config: importstr 'alertmanager-config.yaml', + }, + }, + }).alertmanager.secret diff --git a/examples/alertmanager-config.jsonnet b/examples/alertmanager-config.jsonnet new file mode 100644 index 00000000..162104d7 --- /dev/null +++ b/examples/alertmanager-config.jsonnet @@ -0,0 +1,22 @@ +((import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + alertmanager+: { + config: ||| + global: + resolve_timeout: 10m + route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' + receivers: + - name: 'null' + |||, + }, + }, + }).alertmanager.secret diff --git a/examples/alertmanager-config.yaml b/examples/alertmanager-config.yaml new file mode 100644 index 00000000..78c65b64 --- /dev/null +++ b/examples/alertmanager-config.yaml @@ -0,0 +1,15 @@ +# external alertmanager yaml +global: + resolve_timeout: 10m +route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' +receivers: +- name: 'null' diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 473f89d3..27bc2398 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -1,7 +1,5 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; -local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n 
receiver: 'null'\n routes:\n - match:\n alertname: DeadMansSwitch\n receiver: 'null'\nreceivers:\n- name: 'null'\n"; - { _config+:: { namespace: 'default', @@ -16,7 +14,22 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by alertmanager+:: { name: $._config.alertmanager.name, - config: alertmanagerConfig, + config: ||| + global: + resolve_timeout: 5m + route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' + receivers: + - name: 'null' + |||, replicas: 3, }, }, diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml index 07155d97..4a143fbb 100644 --- a/manifests/alertmanager-secret.yaml +++ b/manifests/alertmanager-secret.yaml @@ -1,6 +1,6 @@ apiVersion: v1 data: - alertmanager.yaml: Cmdsb2JhbDoKICByZXNvbHZlX3RpbWVvdXQ6IDVtCnJvdXRlOgogIGdyb3VwX2J5OiBbJ2pvYiddCiAgZ3JvdXBfd2FpdDogMzBzCiAgZ3JvdXBfaW50ZXJ2YWw6IDVtCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByZWNlaXZlcjogJ251bGwnCiAgcm91dGVzOgogIC0gbWF0Y2g6CiAgICAgIGFsZXJ0bmFtZTogRGVhZE1hbnNTd2l0Y2gKICAgIHJlY2VpdmVyOiAnbnVsbCcKcmVjZWl2ZXJzOgotIG5hbWU6ICdudWxsJwo= + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== kind: Secret metadata: name: alertmanager-main From f81bdd892815a03fd14302f1b2c72fde9969aa08 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 4 Jun 2018 16:29:40 +0200 Subject: [PATCH 293/638] kube-prometheus: Use 1 ConfigMap per Grafana dashboard --- .../kube-prometheus/kube-prometheus.libsonnet | 4 + manifests/grafana-dashboardDatasources.yaml | 2 +- manifests/grafana-dashboardDefinitions.yaml | 13871 ++++++++-------- 
manifests/grafana-deployment.yaml | 50 +- 4 files changed, 7009 insertions(+), 6918 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 6c1636de..4402ca96 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -1,4 +1,5 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local configMapList = k.core.v1.configMapList; (import 'grafana/grafana.libsonnet') + (import 'kube-state-metrics/kube-state-metrics.libsonnet') + @@ -12,6 +13,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; kubePrometheus+:: { namespace: k.core.v1.namespace.new($._config.namespace), }, + grafana+:: { + dashboardDefinitions: configMapList.new(super.dashboardDefinitions), + }, } + { _config+:: { namespace: 'default', diff --git a/manifests/grafana-dashboardDatasources.yaml b/manifests/grafana-dashboardDatasources.yaml index 5ed25a02..70872c84 100644 --- a/manifests/grafana-dashboardDatasources.yaml +++ b/manifests/grafana-dashboardDatasources.yaml @@ -5,7 +5,7 @@ data: "datasources": [ { "access": "proxy", - "etitable": false, + "editable": false, "name": "prometheus", "org_id": 1, "type": "prometheus", diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 7f6bc8f5..6f04bb1f 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1,6914 +1,6959 @@ apiVersion: v1 -data: - k8s-cluster-rsrc-use.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 0, - "legend": { - "avg": false, - "current": false, 
- "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Utilisation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_cpu_saturation_load1: / scalar(sum(min(kube_pod_info) by (node)))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", 
- "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Saturation (Load1)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_memory_utilisation:ratio", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Utilisation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": 
"short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_memory_swap_io_bytes:sum_rate", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Saturation (Swap I/O)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": 
"null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_disk_utilisation:avg_irate / scalar(:kube_pod_info_node_count:)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Utilisation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_disk_saturation:avg_irate / scalar(:kube_pod_info_node_count:)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Saturation", - "tooltip": { - "shared": true, - "sort": 0, - 
"value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_net_utilisation:sum_irate", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Utilisation (Transmitted)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - 
"dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_net_saturation:sum_irate", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Saturation (Dropped)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, 
- "steppedLine": false, - "targets": [ - { - "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"} - node_filesystem_avail{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace)) by (pod,namespace) / scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace))) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk Capacity", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Storage", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "utc", - "title": "K8s / USE Method / Cluster", - "version": 0 - } - k8s-node-rsrc-use.json: |- - { - "annotations": { - "list": [ - - ] - }, - 
"editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 0, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_cpu_utilisation:avg1m{node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Utilisation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Utilisation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - 
"spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_cpu_saturation_load1:{node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Saturation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Saturation (Load1)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_memory_utilisation:{node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Memory", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Utilisation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - 
"buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_memory_swap_io_bytes:sum_rate{node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Swap IO", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Saturation (Swap I/O)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 4, - 
"legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_disk_utilisation:avg_irate{node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Utilisation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Utilisation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_disk_saturation:avg_irate{node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Saturation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - 
"timeShift": null, - "title": "Disk IO Saturation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_net_utilisation:sum_irate{node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Utilisation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Utilisation (Transmitted)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": 
false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_net_saturation:sum_irate{node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Saturation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Saturation (Dropped)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Net", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - 
"seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "1 - sum(max by (device, node) (node_filesystem_avail{fstype=\u007e\"ext[24]\"})) / sum(max by (device, node) (node_filesystem_size{fstype=\u007e\"ext[24]\"}))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Disk", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk Utilisation", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "node", - "multi": false, - "name": "node", - "options": [ - - ], - "query": "label_values(kube_node_info, node)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", 
- "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "utc", - "title": "K8s / USE Method / Node", - "version": 0 - } - k8s-resources-cluster.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "100px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 0, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores) / sum(node:node_num_cpu:sum)", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "CPU Requests Commitment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - 
"id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores) / sum(node:node_num_cpu:sum)", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "CPU Limits Commitment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(node_memory_MemTotal)", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", 
- "timeFrom": null, - "timeShift": null, - "title": "Memory Requests Commitment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(node_memory_MemTotal)", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "Memory Limits Commitment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Headlines", - 
"titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_cpu_usage_seconds_total[1m])) by (namespace)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{namespace}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - 
"pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Namespace", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": 
"/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", - "pattern": "namespace", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 
null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(container_memory_rss) by (namespace)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{namespace}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage (w/o cache)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "decbytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": 
false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Namespace", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD 
HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", - "pattern": "namespace", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(container_memory_rss) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Requests by Namespace", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": 
null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Requests", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "utc", - "title": "K8s / Compute Resources / Cluster", - "version": 0 - } - k8s-resources-namespace.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 0, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[1m])) by (pod_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod_name}}", - "legendLink": 
null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - 
"unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": 
"sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - 
"nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\"}) by (pod_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod_name}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - 
"decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": 
"sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Quota", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory 
Quota", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "utc", - "title": "K8s / Compute Resources / Namespace", - "version": 0 - } - k8s-resources-pod.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 0, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - 
"steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{container_name}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": 
"number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Container", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "container", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - 
"legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - 
}, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{container_name}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - 
"nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "decbytes" - }, - { - "alias": "Memory Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Container", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": 
"", - "pattern": "container", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Quota", - "tooltip": { - "shared": 
true, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Quota", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "pod", - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ 
- "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "utc", - "title": "K8s / Compute Resources / Pod", - "version": 0 - } - nodes.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[5m])) * 100)\n", - "format": "time_series", - "intervalFactor": 10, - "legendFormat": "{{cpu}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Idle CPU", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "percent", - "label": null, - "logBase": 1, - "max": 100, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - 
"fill": 1, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"} * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 1m", - "refId": "A" - }, - { - "expr": "node_load5{job=\"node-exporter\", instance=\"$instance\"} * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 5m", - "refId": "B" - }, - { - "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"} * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 15m", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - 
"gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory used", - "refId": "A" - }, - { - "expr": "node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "refId": "B" - }, - { - "expr": "node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory cached", - "refId": "C" - }, - { - "expr": "node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory free", - "refId": "D" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - 
"show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n) * 100\n /\nnode_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Memory Usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - 
"dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{job=\"node-exporter\", instance=\"$instance\"}[2m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "read", - "refId": "A" - }, - { - "expr": "sum by (instance) (rate(node_disk_bytes_written{job=\"node-exporter\", instance=\"$instance\"}[2m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "written", - "refId": "B" - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{job=\"node-exporter\", instance=\"$instance\"}[2m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "io time", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 
129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(\n sum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n- sum(node_filesystem_avail{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n) * 100\n /\nsum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Disk Space Usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": 
false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network Received", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_transmit_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - 
"legendFormat": "{{device}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network Transmitted", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "instance", - "options": [ - - ], - "query": "label_values(node_boot_time{job=\"node-exporter\"}, instance)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Nodes", - "version": 0 - } - pods.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - 
"hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current: {{ container_name }}", - "refId": "A" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "refId": "B" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": 
"time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], 
- "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network I/O", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - 
"hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info, namespace)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - "options": [ - - ], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Pods", - "version": 0 - } - statefulset.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": 
[ - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "CPU", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - 
} - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Memory", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "Bps", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + 
sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Network", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Desired Replicas", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - 
"value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Replicas of current version", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - 
"name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Observed Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 8, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", 
statefulset=\"$statefulset\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "", - "title": "Metadata Generation", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas specified", - "refId": "A" - }, - { - "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas created", - "refId": "B" - }, - { - "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", - "format": 
"time_series", - "intervalFactor": 2, - "legendFormat": "ready", - "refId": "C" - }, - { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas of current version", - "refId": "D" - }, - { - "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "E" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}, namespace)", - "refresh": 2, - "regex": 
"", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Name", - "multi": false, - "name": "statefulset", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", namespace=\"$namespace\"}, statefulset)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "StatefulSets", - "version": 0 - } -kind: ConfigMap -metadata: - name: grafana-dashboard-definitions - namespace: monitoring +items: +- apiVersion: v1 + data: + k8s-cluster-rsrc-use.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_utilisation:avg1m * 
node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_saturation_load1: / scalar(sum(min(kube_pod_info) by (node)))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Saturation (Load1)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, 
+ "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_utilisation:ratio", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], 
+ "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_swap_io_bytes:sum_rate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Saturation (Swap I/O)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_utilisation:avg_irate / scalar(:kube_pod_info_node_count:)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": 
"/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_saturation:avg_irate / scalar(:kube_pod_info_node_count:)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + 
"repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_utilisation:sum_irate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Utilisation (Transmitted)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + 
"spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_saturation:sum_irate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Saturation (Dropped)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Network", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"} - node_filesystem_avail{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace)) by (pod,namespace) / scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace))) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:\n", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Capacity", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "K8s / USE Method / Cluster", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-cluster-rsrc-use + namespace: monitoring +- apiVersion: v1 + data: + k8s-node-rsrc-use.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "$datasource", + "fill": 1, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_utilisation:avg1m{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_saturation_load1:{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + 
"thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Saturation (Load1)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_utilisation:{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Memory", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_swap_io_bytes:sum_rate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Swap IO", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Saturation (Swap I/O)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, 
+ "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_utilisation:avg_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_saturation:avg_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": 
"percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_utilisation:sum_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Utilisation (Transmitted)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": 
false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_saturation:sum_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Saturation (Dropped)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Net", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - sum(max by (device, node) (node_filesystem_avail{fstype=\u007e\"ext[24]\"})) / sum(max by (device, node) (node_filesystem_size{fstype=\u007e\"ext[24]\"}))", + 
"format": "time_series", + "intervalFactor": 2, + "legendFormat": "Disk", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "node", + "multi": false, + "name": "node", + "options": [ + + ], + "query": "label_values(kube_node_info, node)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "K8s / USE Method / Node", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-node-rsrc-use + 
namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-cluster.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "100px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores) / sum(node:node_num_cpu:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Requests Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores) / sum(node:node_num_cpu:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Limits Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(node_memory_MemTotal)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Requests Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + 
"type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(node_memory_MemTotal)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Limits Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Headlines", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "$datasource", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_cpu_usage_seconds_total[1m])) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": 
[ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + 
"dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + 
"aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_rss) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage (w/o cache)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + 
"spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + 
"type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(container_memory_rss) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Requests by Namespace", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Requests", + "titleSize": "h6" + } + ], + 
"schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "K8s / Compute Resources / Cluster", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-resources-cluster + namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-namespace.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[1m])) by (pod_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": 
"CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": 
"YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) 
by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + 
], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\"}) by (pod_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + 
"unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + 
"intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Quota", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + 
"value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "K8s / Compute Resources / Namespace", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-resources-namespace + namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-pod.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU 
Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": 
"sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + 
"panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + 
"type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + 
"transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Quota", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "pod", + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + 
"2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "K8s / Compute Resources / Pod", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-resources-pod + namespace: monitoring +- apiVersion: v1 + data: + nodes.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[5m])) * 100)\n", + "format": "time_series", + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Idle CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 1m", + "refId": "A" + }, + { + "expr": "node_load5{job=\"node-exporter\", instance=\"$instance\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 5m", + "refId": "B" + }, + { + "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 15m", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 
10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory used", + "refId": "A" + }, + { + "expr": "node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "refId": "B" + }, + { + "expr": "node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory cached", + "refId": "C" + }, + { + "expr": "node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": 
"bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n) * 100\n /\nnode_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + 
"height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "written", + "refId": "B" + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "io time", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, 
+ "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(\n sum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n- sum(node_filesystem_avail{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n) * 100\n /\nsum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Disk Space Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + 
"alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Received", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{job=\"node-exporter\", instance=\"$instance\", 
device!\u007e\"lo\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Transmitted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(node_boot_time{job=\"node-exporter\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 0 + } + kind: ConfigMap + metadata: + name: 
grafana-dashboard-nodes + namespace: monitoring +- apiVersion: v1 + data: + pods.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current: {{ container_name }}", + "refId": "A" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "refId": "B" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": 
null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + 
"label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": 
"h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [ + + ], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 0 + } + kind: ConfigMap + 
metadata: + name: grafana-dashboard-pods + namespace: monitoring +- apiVersion: v1 + data: + statefulset.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "CPU", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 
100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "Bps", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 
0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without 
(instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 6, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Replicas of current version", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, 
+ "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 8, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": 
{ + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas specified", + "refId": "A" + }, + { + "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "replicas created", + "refId": "B" + }, + { + "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "ready", + "refId": "C" + }, + { + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas of current version", + "refId": "D" + }, + { + "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "E" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + 
"includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}, namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Name", + "multi": false, + "name": "statefulset", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", namespace=\"$namespace\"}, statefulset)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "StatefulSets", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-statefulset + namespace: monitoring +kind: ConfigMapList diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 814e98cb..8002d832 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -38,8 +38,29 @@ spec: - mountPath: /etc/grafana/provisioning/dashboards name: grafana-dashboards readOnly: false - - mountPath: /grafana-dashboard-definitions/0 - name: grafana-dashboard-definitions + - mountPath: /grafana-dashboard-definitions/0/k8s-cluster-rsrc-use + name: grafana-dashboard-k8s-cluster-rsrc-use + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-node-rsrc-use + name: grafana-dashboard-k8s-node-rsrc-use + readOnly: false + - mountPath: 
/grafana-dashboard-definitions/0/k8s-resources-cluster + name: grafana-dashboard-k8s-resources-cluster + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-resources-namespace + name: grafana-dashboard-k8s-resources-namespace + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod + name: grafana-dashboard-k8s-resources-pod + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/nodes + name: grafana-dashboard-nodes + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/pods + name: grafana-dashboard-pods + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/statefulset + name: grafana-dashboard-statefulset readOnly: false securityContext: runAsNonRoot: true @@ -55,5 +76,26 @@ spec: name: grafana-dashboards name: grafana-dashboards - configMap: - name: grafana-dashboard-definitions - name: grafana-dashboard-definitions + name: grafana-dashboard-k8s-cluster-rsrc-use + name: grafana-dashboard-k8s-cluster-rsrc-use + - configMap: + name: grafana-dashboard-k8s-node-rsrc-use + name: grafana-dashboard-k8s-node-rsrc-use + - configMap: + name: grafana-dashboard-k8s-resources-cluster + name: grafana-dashboard-k8s-resources-cluster + - configMap: + name: grafana-dashboard-k8s-resources-namespace + name: grafana-dashboard-k8s-resources-namespace + - configMap: + name: grafana-dashboard-k8s-resources-pod + name: grafana-dashboard-k8s-resources-pod + - configMap: + name: grafana-dashboard-nodes + name: grafana-dashboard-nodes + - configMap: + name: grafana-dashboard-pods + name: grafana-dashboard-pods + - configMap: + name: grafana-dashboard-statefulset + name: grafana-dashboard-statefulset From 1d36d01b975cadd5023a1853babceb54ff0be362 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Fri, 8 Jun 2018 17:42:27 +0200 Subject: [PATCH 294/638] *: Refactor build system - Move prometheus-config-reloader to cmd/ - Refactor Makefile & contrib/kube-prometheus/Makefile - Only execute a target if its 
dependencies changed - Create empty target file for docker builds - Replace promu with plain static `go build` --- Makefile | 61 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index fc1e7973..9cd7f86f 100644 --- a/Makefile +++ b/Makefile @@ -1,26 +1,61 @@ JSONNET_FMT := jsonnet fmt -n 2 --max-blank-lines 2 --string-style s --comment-style s -image: - docker build -f ../../scripts/jsonnet/Dockerfile -t po-jsonnet ../../ +JB_BINARY:=$(GOPATH)/bin/jb +EMBEDMD_BINARY:=$(GOPATH)/bin/embedmd -generate: image +# edit 1 + +all: generate fmt test + +../../hack/jsonnet-docker-image: ../../scripts/jsonnet/Dockerfile +# Create empty target file, for the sole purpose of recording when this target +# was last executed via the last-modification timestamp on the file. See +# https://www.gnu.org/software/make/manual/make.html#Empty-Targets + docker build -f - -t po-jsonnet . < ../../scripts/jsonnet/Dockerfile + touch $@ + +generate-in-docker: ../../hack/jsonnet-docker-image @echo ">> Compiling assets and generating Kubernetes manifests" - docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make generate-raw + docker run \ + --rm \ + -u=$(shell id -u $(USER)):$(shell id -g $(USER)) \ + -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ \ + --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ + po-jsonnet make generate -generate-raw: fmt - jb install +generate: manifests/** **.md + +**.md: $(EMBEDMD_BINARY) $(shell find examples) build.sh example.jsonnet + echo '>>> inside **.md' + $(EMBEDMD_BINARY) -w `find . 
-name "*.md" | grep -v vendor` + +manifests/**: vendor/** example.jsonnet ./build.sh +vendor/**: $(JB_BINARY) jsonnetfile.json + $(JB_BINARY) install + fmt: find . -name 'vendor' -prune -o -name '*.libsonnet' -o -name '*.jsonnet' -print | \ xargs -n 1 -- $(JSONNET_FMT) -i -test: image - @echo ">> Compiling assets and generating Kubernetes manifests" - docker run --rm -u=$(shell id -u $(USER)):$(shell id -g $(USER)) -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus po-jsonnet make test-raw - -test-raw: crdtojsonnet - jb install +test: $(JB_BINARY) + $(JB_BINARY) install ./test.sh -.PHONY: image generate crdtojsonnet generate-raw test test-raw fmt +test-in-docker: ../../hack/jsonnet-docker-image + @echo ">> Compiling assets and generating Kubernetes manifests" + docker run \ + --rm \ + -u=$(shell id -u $(USER)):$(shell id -g $(USER)) \ + -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ \ + --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ + po-jsonnet make test + +$(JB_BINARY): + go get -u github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb + +$(EMBEDMD_BINARY): + go get github.com/campoy/embedmd + +.PHONY: generate generate-in-docker test test-in-docker fmt From ba506ced746809d0f571da1bc5706f49eb540881 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 14 Jun 2018 18:57:00 +0200 Subject: [PATCH 295/638] kube-prometheus: Update grafana jsonnet dependency --- manifests/grafana-dashboardDatasources.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/manifests/grafana-dashboardDatasources.yaml b/manifests/grafana-dashboardDatasources.yaml index 70872c84..5585057c 100644 --- a/manifests/grafana-dashboardDatasources.yaml +++ b/manifests/grafana-dashboardDatasources.yaml @@ -2,12 +2,13 @@ apiVersion: v1 data: prometheus.yaml: |- { + 
"apiVersion": 1, "datasources": [ { "access": "proxy", "editable": false, "name": "prometheus", - "org_id": 1, + "orgId": 1, "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090", "version": 1 From 603771ff144b43c7f551679c526fe4f5c256cd14 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Thu, 14 Jun 2018 15:09:08 +0200 Subject: [PATCH 296/638] kube-prometheus: Run node-exporter in host network Node exporter needs to run in the host network, not in the pod network in order to pick up network metrics of the node. --- .../kube-prometheus/node-exporter/node-exporter.libsonnet | 6 ++++-- manifests/grafana-dashboardDatasources.yaml | 3 ++- manifests/node-exporter-daemonset.yaml | 3 +++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 2d012110..a3e8d7a7 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -90,7 +90,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--secure-listen-address=:9100', '--upstream=http://127.0.0.1:9101/', ]) + - container.withPorts(containerPort.newNamed('https', 9100)) + + container.withPorts(containerPort.new(9100) + containerPort.withHostPort(9100) + containerPort.withName('https')) + container.mixin.resources.withRequests({ cpu: '10m', memory: '20Mi' }) + container.mixin.resources.withLimits({ cpu: '20m', memory: '40Mi' }); @@ -108,7 +108,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume]) + daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - daemonset.mixin.spec.template.spec.withServiceAccountName('node-exporter'), + daemonset.mixin.spec.template.spec.withServiceAccountName('node-exporter') + + 
daemonset.mixin.spec.template.spec.withHostPid(true) + + daemonset.mixin.spec.template.spec.withHostNetwork(true), serviceAccount: local serviceAccount = k.core.v1.serviceAccount; diff --git a/manifests/grafana-dashboardDatasources.yaml b/manifests/grafana-dashboardDatasources.yaml index 70872c84..5585057c 100644 --- a/manifests/grafana-dashboardDatasources.yaml +++ b/manifests/grafana-dashboardDatasources.yaml @@ -2,12 +2,13 @@ apiVersion: v1 data: prometheus.yaml: |- { + "apiVersion": 1, "datasources": [ { "access": "proxy", "editable": false, "name": "prometheus", - "org_id": 1, + "orgId": 1, "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090", "version": 1 diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 8488735c..92182e05 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -42,6 +42,7 @@ spec: name: kube-rbac-proxy ports: - containerPort: 9100 + hostPort: 9100 name: https resources: limits: @@ -50,6 +51,8 @@ spec: requests: cpu: 10m memory: 20Mi + hostNetwork: true + hostPID: true nodeSelector: beta.kubernetes.io/os: linux securityContext: From 4b6a761dc5d1d66884bc6abb0b7f2b818e2bfc56 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 18 Jun 2018 10:56:38 +0200 Subject: [PATCH 297/638] kube-prometheus: Update kubernetes monitoring mixin --- manifests/grafana-dashboardDefinitions.yaml | 34 ++++++---- manifests/prometheus-rules.yaml | 73 +++++++++++++++++++-- 2 files changed, 90 insertions(+), 17 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 6f04bb1f..1c973836 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -64,7 +64,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink":
"/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -150,7 +150,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -248,7 +248,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -334,7 +334,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -432,7 +432,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -518,7 +518,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -616,7 +616,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -702,7 +702,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -800,7 +800,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": 
"/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -909,6 +909,7 @@ items: }, "timezone": "utc", "title": "K8s / USE Method / Cluster", + "uid": "a6e7d1362e1ddbb79db21d5bb40d7137", "version": 0 } kind: ConfigMap @@ -1851,6 +1852,7 @@ items: }, "timezone": "utc", "title": "K8s / USE Method / Node", + "uid": "4ac4f123aae0ff6dbaf4f4f66120033b", "version": 0 } kind: ConfigMap @@ -2468,7 +2470,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -2828,7 +2830,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -3000,6 +3002,7 @@ items: }, "timezone": "utc", "title": "K8s / Compute Resources / Cluster", + "uid": "efa86fd1d0c121a26444b636a3f509a8", "version": 0 } kind: ConfigMap @@ -3269,7 +3272,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -3629,7 +3632,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": 
"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -3828,6 +3831,7 @@ items: }, "timezone": "utc", "title": "K8s / Compute Resources / Namespace", + "uid": "85a562078cdf77779eaa1add43ccec1e", "version": 0 } kind: ConfigMap @@ -4683,6 +4687,7 @@ items: }, "timezone": "utc", "title": "K8s / Compute Resources / Pod", + "uid": "6581e46e4e5c7ba40a07646395ef7b23", "version": 0 } kind: ConfigMap @@ -5609,6 +5614,7 @@ items: }, "timezone": "browser", "title": "Nodes", + "uid": "fa49a4706d07a042595b664c87fb33ea", "version": 0 } kind: ConfigMap @@ -6098,6 +6104,7 @@ items: }, "timezone": "browser", "title": "Pods", + "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", "version": 0 } kind: ConfigMap @@ -6950,6 +6957,7 @@ items: }, "timezone": "browser", "title": "StatefulSets", + "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 } kind: ConfigMap diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index cca1c735..75d5f36e 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -202,21 +202,21 @@ spec: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | - avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) record: :node_disk_utilisation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) record: 
:node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -268,6 +268,7 @@ spec: - alert: AlertmanagerDown annotations: message: Alertmanager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown expr: | absent(up{job="alertmanager-main"} == 1) for: 15m @@ -276,6 +277,7 @@ spec: - alert: KubeAPIDown annotations: message: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown expr: | absent(up{job="apiserver"} == 1) for: 15m @@ -284,6 +286,7 @@ spec: - alert: KubeControllerManagerDown annotations: message: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown expr: | absent(up{job="kube-controller-manager"} == 1) for: 15m @@ -292,6 +295,7 @@ spec: - alert: KubeSchedulerDown annotations: message: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown expr: | absent(up{job="kube-scheduler"} == 1) for: 15m @@ -300,6 +304,7 @@ spec: - alert: KubeStateMetricsDown annotations: message: KubeStateMetrics has disappeared from Prometheus target discovery. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown expr: | absent(up{job="kube-state-metrics"} == 1) for: 15m @@ -308,6 +313,7 @@ spec: - alert: KubeletDown annotations: message: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown expr: | absent(up{job="kubelet"} == 1) for: 15m @@ -316,6 +322,7 @@ spec: - alert: NodeExporterDown annotations: message: NodeExporter has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown expr: | absent(up{job="node-exporter"} == 1) for: 15m @@ -324,6 +331,7 @@ spec: - alert: PrometheusDown annotations: message: Prometheus has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown expr: | absent(up{job="prometheus-k8s"} == 1) for: 15m @@ -332,6 +340,7 @@ spec: - alert: PrometheusOperatorDown annotations: message: PrometheusOperator has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown expr: | absent(up{job="prometheus-operator"} == 1) for: 15m @@ -343,6 +352,7 @@ spec: annotations: message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} / second' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping expr: | rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0 for: 1h @@ -351,6 +361,7 @@ spec: - alert: KubePodNotReady annotations: message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.' 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready expr: | sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0 for: 1h @@ -360,6 +371,7 @@ spec: annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation mismatch + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch expr: | kube_deployment_status_observed_generation{job="kube-state-metrics"} != @@ -371,6 +383,7 @@ spec: annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch expr: | kube_deployment_spec_replicas{job="kube-state-metrics"} != @@ -382,6 +395,7 @@ spec: annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica mismatch + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch expr: | kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != @@ -393,6 +407,7 @@ spec: annotations: message: StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation mismatch + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch expr: | kube_statefulset_status_observed_generation{job="kube-state-metrics"} != @@ -404,6 +419,7 @@ spec: annotations: message: Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespace}}/{{$labels.daemonset}} + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck expr: | kube_daemonset_status_number_ready{job="kube-state-metrics"} / @@ -415,6 +431,7 @@ 
spec: annotations: message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are not scheduled. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled expr: | kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - @@ -426,17 +443,48 @@ spec: annotations: message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are running where they are not supposed to run. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled expr: | kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning + - alert: KubeCronJobRunning + annotations: + message: CronJob {{ $labels.namespaces }}/{{ $labels.cronjob }} is taking + more than 1h to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning + expr: | + time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 + for: 1h + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + message: Job {{ $labels.namespaces }}/{{ $labels.job }} is taking more than + 1h to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion + expr: | + kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + message: Job {{ $labels.namespaces }}/{{ $labels.job }} failed to complete. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed + expr: | + kube_job_status_failed{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning - name: kubernetes-resources rules: - alert: KubeCPUOvercommit annotations: message: Overcommited CPU resource requests on Pods, cannot tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) / @@ -450,6 +498,7 @@ spec: annotations: message: Overcommited Memory resource requests on Pods, cannot tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) / @@ -464,6 +513,7 @@ spec: - alert: KubeCPUOvercommit annotations: message: Overcommited CPU resource request quota on Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) / @@ -475,6 +525,7 @@ spec: - alert: KubeMemOvercommit annotations: message: Overcommited Memory resource request quota on Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) / @@ -487,6 +538,7 @@ spec: annotations: message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}.' 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded expr: | 100 * kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) @@ -502,6 +554,7 @@ spec: message: The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}% free. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical expr: | 100 * kubelet_volume_stats_available_bytes{job="kubelet"} / @@ -515,6 +568,7 @@ spec: message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is expected to fill up within four days. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays expr: | predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0 for: 5m @@ -525,6 +579,7 @@ spec: - alert: KubeNodeNotReady annotations: message: '{{ $labels.node }} has been unready for more than an hour' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready expr: | kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 for: 1h @@ -534,6 +589,7 @@ spec: annotations: message: There are {{ $value }} different versions of Kubernetes components running. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch expr: | count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 for: 1h @@ -543,6 +599,7 @@ spec: annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.' 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100 / @@ -555,6 +612,7 @@ spec: annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / sec.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 for: 15m @@ -564,6 +622,7 @@ spec: annotations: message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods expr: | kubelet_running_pod_count{job="kubelet"} > 100 for: 15m @@ -573,6 +632,7 @@ spec: annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m @@ -582,6 +642,7 @@ spec: annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m @@ -590,6 +651,7 @@ spec: - alert: KubeAPIErrorsHigh annotations: message: API server is erroring for {{ $value }}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) / @@ -600,6 +662,7 @@ spec: - alert: KubeAPIErrorsHigh annotations: message: API server is erroring for {{ $value }}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) / @@ -610,6 +673,7 @@ spec: - alert: KubeClientCertificateExpiration annotations: message: Kubernetes API certificate is expiring in less than 7 days. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: @@ -617,6 +681,7 @@ spec: - alert: KubeClientCertificateExpiration annotations: message: Kubernetes API certificate is expiring in less than 1 day. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: From e301f7f31a218116351d7bebb43f84b41dfa6655 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 14 Jun 2018 18:47:15 +0200 Subject: [PATCH 298/638] *: Add Thanos integration --- experimental/thanos/prometheus-self.yaml | 73 ----------------------- experimental/thanos/query.yaml | 51 ---------------- experimental/thanos/thanos-peers-svc.yaml | 14 ----- 3 files changed, 138 deletions(-) delete mode 100644 experimental/thanos/prometheus-self.yaml delete mode 100644 experimental/thanos/query.yaml delete mode 100644 experimental/thanos/thanos-peers-svc.yaml diff --git a/experimental/thanos/prometheus-self.yaml b/experimental/thanos/prometheus-self.yaml deleted file mode 100644 index e778905a..00000000 --- a/experimental/thanos/prometheus-self.yaml +++ /dev/null @@ -1,73 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus -metadata: - name: self - labels: - prometheus: self -spec: - podMetadata: - labels: - thanos-peer: 'true' - replicas: 2 - version: v2.2.1 - serviceAccountName: prometheus-k8s - serviceMonitorSelector: - matchLabels: - app: prometheus - ruleSelector: - matchLabels: - role: prometheus-rulefiles - prometheus: k8s - resources: - requests: - # 2Gi is default, but won't schedule if you don't have a node with >2Gi - # memory. Modify based on your target and time-series count for - # production use. This value is mainly meant for demonstration/testing - # purposes. 
- memory: 400Mi - containers: - - name: thanos - image: improbable/thanos:latest - args: - - "sidecar" - - "--log.level=debug" - - "--cluster.peers=thanos-peers.default.svc:10900" - ports: - - name: http - containerPort: 10902 - - name: grpc - containerPort: 10901 - - name: cluster - containerPort: 10900 ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: prometheus - labels: - app: prometheus -spec: - selector: - matchLabels: - app: prometheus - endpoints: - - port: web - interval: 30s ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app: prometheus - prometheus: self - name: prometheus-self -spec: - type: NodePort - ports: - - name: web - nodePort: 30900 - port: 9090 - protocol: TCP - targetPort: web - selector: - prometheus: self diff --git a/experimental/thanos/query.yaml b/experimental/thanos/query.yaml deleted file mode 100644 index eb1d99ba..00000000 --- a/experimental/thanos/query.yaml +++ /dev/null @@ -1,51 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: thanos-query - labels: - app: thanos-query - thanos-peer: "true" -spec: - replicas: 2 - selector: - matchLabels: - app: thanos-query - thanos-peer: "true" - template: - metadata: - labels: - app: thanos-query - thanos-peer: "true" - spec: - containers: - - name: thanos-query - image: improbable/thanos:latest - args: - - "query" - - "--log.level=debug" - - "--query.replica-label=prometheus_replica" - - "--cluster.peers=thanos-peers.default.svc:10900" - ports: - - name: http - containerPort: 10902 - - name: grpc - containerPort: 10901 - - name: cluster - containerPort: 10900 ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app: thanos-query - name: thanos-query -spec: - type: NodePort - selector: - app: thanos-query - ports: - - port: 9090 - protocol: TCP - targetPort: http - name: http-query - nodePort: 31111 \ No newline at end of file diff --git a/experimental/thanos/thanos-peers-svc.yaml b/experimental/thanos/thanos-peers-svc.yaml 
deleted file mode 100644 index afcfcfe4..00000000 --- a/experimental/thanos/thanos-peers-svc.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: thanos-peers -spec: - type: ClusterIP - clusterIP: None - ports: - - name: cluster - port: 10900 - targetPort: cluster - selector: - # Useful endpoint for gathering all thanos components for common gossip cluster. - thanos-peer: "true" \ No newline at end of file From 631299808056c75e7c4d66ca9199085ed1b717ed Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 25 Jun 2018 11:02:26 +0200 Subject: [PATCH 299/638] kube-prometheus: Bump various versions --- .../alertmanager/alertmanager.libsonnet | 2 +- .../kube-state-metrics.libsonnet | 4 +- .../node-exporter/node-exporter.libsonnet | 2 +- .../prometheus/prometheus.libsonnet | 2 +- ...r-0prometheusCustomResourceDefinition.yaml | 71 +++++++++++++++++++ manifests/alertmanager-alertmanager.yaml | 2 +- manifests/kube-state-metrics-deployment.yaml | 6 +- manifests/node-exporter-daemonset.yaml | 2 +- manifests/prometheus-prometheus.yaml | 2 +- 9 files changed, 82 insertions(+), 11 deletions(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 27bc2398..250e7bd7 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - alertmanager: 'v0.14.0', + alertmanager: 'v0.15.0', }, imageRepos+:: { diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 5fe1c074..c36f293b 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -5,8 +5,8 @@ local k = import 
'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - kubeStateMetrics: 'v1.3.0', - kubeRbacProxy: 'v0.3.0', + kubeStateMetrics: 'v1.3.1', + kubeRbacProxy: 'v0.3.1', addonResizer: '1.0', }, diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index a3e8d7a7..c51347a3 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -6,7 +6,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { nodeExporter: 'v0.15.2', - kubeRbacProxy: 'v0.3.0', + kubeRbacProxy: 'v0.3.1', }, imageRepos+:: { diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index f1758cf7..e84986f5 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheus: 'v2.2.1', + prometheus: 'v2.3.1', }, imageRepos+:: { diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index f4d73c22..923344f2 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -2672,6 +2672,77 @@ spec: phase: description: Phase represents the current phase of PersistentVolumeClaim. type: string + thanos: + description: ThanosSpec defines parameters for a Prometheus server within + a Thanos deployment. + properties: + baseImage: + description: Thanos base image if other than default. + type: string + gcs: + description: ThanosGCSSpec defines parameters for use of Google + Cloud Storage (GCS) with Thanos. 
+ properties: + bucket: + description: Google Cloud Storage bucket name for stored blocks. + If empty it won't store any block inside Google Cloud Storage. + type: string + peers: + description: Peers is a DNS name for Thanos to discover peers through. + type: string + s3: + description: ThanosSpec defines parameters for of AWS Simple Storage + Service (S3) with Thanos. (S3 compatible services apply as well) + properties: + accessKey: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bucket: + description: S3-Compatible API bucket name for stored blocks. + type: string + endpoint: + description: S3-Compatible API endpoint for stored blocks. + type: string + insecure: + description: Whether to use an insecure connection with an S3-Compatible + API. + type: boolean + secretKey: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + signatureVersion2: + description: Whether to use S3 Signature Version 2; otherwise + Signature Version 4 will be used. + type: boolean + version: + description: Version describes the version of Thanos to use. + type: string tolerations: description: If specified, the pod's tolerations. 
items: diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index 2a8daa8d..bdc115b9 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -11,4 +11,4 @@ spec: beta.kubernetes.io/os: linux replicas: 3 serviceAccountName: alertmanager-main - version: v0.14.0 + version: v0.15.0 diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index fb2a8b5f..c7bb25c6 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -19,7 +19,7 @@ spec: - args: - --secure-listen-address=:8443 - --upstream=http://127.0.0.1:8081/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + image: quay.io/coreos/kube-rbac-proxy:v0.3.1 name: kube-rbac-proxy-main ports: - containerPort: 8443 @@ -34,7 +34,7 @@ spec: - args: - --secure-listen-address=:9443 - --upstream=http://127.0.0.1:8082/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + image: quay.io/coreos/kube-rbac-proxy:v0.3.1 name: kube-rbac-proxy-self ports: - containerPort: 9443 @@ -51,7 +51,7 @@ spec: - --port=8081 - --telemetry-host=127.0.0.1 - --telemetry-port=8082 - image: quay.io/coreos/kube-state-metrics:v1.3.0 + image: quay.io/coreos/kube-state-metrics:v1.3.1 name: kube-state-metrics resources: limits: diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 92182e05..f7c9ebb5 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -38,7 +38,7 @@ spec: - args: - --secure-listen-address=:9100 - --upstream=http://127.0.0.1:9101/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + image: quay.io/coreos/kube-rbac-proxy:v0.3.1 name: kube-rbac-proxy ports: - containerPort: 9100 diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index b7fe9f25..9a7448b6 100644 --- a/manifests/prometheus-prometheus.yaml +++ 
b/manifests/prometheus-prometheus.yaml @@ -27,4 +27,4 @@ spec: matchExpressions: - key: k8s-app operator: Exists - version: v2.2.1 + version: v2.3.1 From 22066b47086fa132a10eba424e1ac347de5b76bb Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 26 Jun 2018 00:04:01 +0800 Subject: [PATCH 300/638] kube-prometheus/Makefile: Change target output from files to folder Previously the `manifests` and `vendor` folder content was built via `manifests/**` and `vendor/**` target definitions. This results in two issues: 1. The GNU Make recipe is executed for every single file in the folders. 2. Not all timestamps inside the `manifests` folder would be updated on every run, thereby needing to be rerun on all following target executions. Solution: 1. Define the target based on the folder, not individual files 2. Remove target folder before executing the recipe to ensure all timestamps are updated. --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 9cd7f86f..bab98bcb 100644 --- a/Makefile +++ b/Makefile @@ -23,16 +23,18 @@ generate-in-docker: ../../hack/jsonnet-docker-image --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ po-jsonnet make generate -generate: manifests/** **.md +generate: manifests **.md **.md: $(EMBEDMD_BINARY) $(shell find examples) build.sh example.jsonnet echo '>>> inside **.md' $(EMBEDMD_BINARY) -w `find . 
-name "*.md" | grep -v vendor` -manifests/**: vendor/** example.jsonnet +manifests: vendor example.jsonnet + rm -rf manifests ./build.sh -vendor/**: $(JB_BINARY) jsonnetfile.json +vendor: $(JB_BINARY) jsonnetfile.json + rm -rf vendor $(JB_BINARY) install fmt: From 509f33eaeeb709f4617331662763cb7bb144796a Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 26 Jun 2018 16:03:02 +0200 Subject: [PATCH 301/638] Remote write queue config (#1517) RemoteWriteSpec Expose QueueConfig Parameters (#1488) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9cd7f86f..0e749190 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ generate: manifests/** **.md echo '>>> inside **.md' $(EMBEDMD_BINARY) -w `find . -name "*.md" | grep -v vendor` -manifests/**: vendor/** example.jsonnet +manifests/**: vendor/** $(wildcard jsonnet/**/*) example.jsonnet ./build.sh vendor/**: $(JB_BINARY) jsonnetfile.json From f434ee9ae915a2b893c761c78764f6f39dc3d68b Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 27 Jun 2018 10:24:33 +0200 Subject: [PATCH 302/638] kube-prometheus: Regenerate prometheus CRD --- ...r-0prometheusCustomResourceDefinition.yaml | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 923344f2..1af66f0f 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1801,6 +1801,42 @@ spec: proxyUrl: description: Optional ProxyURL type: string + queueConfig: + description: QueueConfig allows the tuning of remote_write queue_config + parameters. This object is referenced in the RemoteWriteSpec + object. + properties: + batchSendDeadline: + description: BatchSendDeadline is the maximum time a sample + will wait in buffer. 
+ type: string + capacity: + description: Capacity is the number of samples to buffer per + shard before we start dropping them. + format: int32 + type: integer + maxBackoff: + description: MaxBackoff is the maximum retry delay. + type: string + maxRetries: + description: MaxRetries is the maximum number of times to + retry a batch on recoverable errors. + format: int32 + type: integer + maxSamplesPerSend: + description: MaxSamplesPerSend is the maximum number of samples + per send. + format: int32 + type: integer + maxShards: + description: MaxShards is the maximum number of shards, i.e. + amount of concurrency. + format: int32 + type: integer + minBackoff: + description: MinBackoff is the initial retry delay. Gets doubled + for every retry. + type: string remoteTimeout: description: Timeout for requests to the remote write endpoint. type: string From fad7e6ed35de7bcd25881ee5d5a5e8570f17eda9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 28 Jun 2018 11:34:53 +0200 Subject: [PATCH 303/638] *: Bump version to v0.21.0 --- jsonnetfile.json | 10 ++++++++++ manifests/0prometheus-operator-deployment.yaml | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.json b/jsonnetfile.json index b4ebb0f2..dc29c821 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -9,6 +9,16 @@ } }, "version": "." + }, + { + "name": "prometheus-operator", + "source": { + "git": { + "remote": "../../", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "." 
} ] } \ No newline at end of file diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 3276d198..df15b954 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -19,8 +19,8 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.20.0 - image: quay.io/coreos/prometheus-operator:v0.20.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.21.0 + image: quay.io/coreos/prometheus-operator:v0.21.0 name: prometheus-operator ports: - containerPort: 8080 From dd834f3128e7b5e2d446c1f2b7fe675aba53459c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 28 Jun 2018 13:44:51 +0200 Subject: [PATCH 304/638] kube-prometheus: Re-generate dashboards --- manifests/grafana-dashboardDefinitions.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 1c973836..a0dba292 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -6727,7 +6727,7 @@ items: "bars": false, "dashLength": 10, "dashes": false, - "datasource": "prometheus", + "datasource": "$datasource", "fill": 1, "gridPos": { @@ -6877,7 +6877,7 @@ items: "current": { }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 0, "includeAll": false, "label": "Namespace", @@ -6903,7 +6903,7 @@ items: "current": { }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 0, "includeAll": false, "label": "Name", From 04b7f2f110239ff3ce1996a0069028c92b44c6ec Mon Sep 17 00:00:00 2001 From: Or Sela Date: Thu, 28 Jun 2018 17:37:32 +0300 Subject: [PATCH 305/638] Fix missing and broken links in developing-prometheus-rules-and-grafana-dashboards doc 
--- docs/developing-prometheus-rules-and-grafana-dashboards.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index 9f1166ce..1eb4f15a 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -109,7 +109,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { ### Pre-rendered rules -We acknowledge, that users may need to transition existing rules, and therefore allow an option to add additional pre-rendered rules. This can be done simply by importing the existing rules in the [Prometheus rule format](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) using the jsonnet function `importstr`. In this example we are importing a [provided example rule](examples/example.rules.yaml). +We acknowledge, that users may need to transition existing rules, and therefore allow an option to add additional pre-rendered rules. This can be done simply by importing the existing rules in the [Prometheus rule format](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) using the jsonnet function `importstr`. In this example we are importing a [provided example rule](../examples/example.rules.yaml). [embedmd]:# (../examples/prometheus-additional-rendered-rule-example.jsonnet) ```jsonnet @@ -139,7 +139,7 @@ Dashboards can either be added using jsonnet or simply a pre-rendered json dashb ### Jsonnet dashboard -We recommend using the [grafonnet]() library for jsonnet, which gives you a simple DSL to generate Grafana dashboards. Following the [Prometheus Monitoring Mixins proposal](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/) additional dashboards are added to the `grafanaDashboards` key, located in the top level object. To add new jsonnet dashboards, simply add one. 
+We recommend using the [grafonnet](https://github.com/grafana/grafonnet-lib/) library for jsonnet, which gives you a simple DSL to generate Grafana dashboards. Following the [Prometheus Monitoring Mixins proposal](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/) additional dashboards are added to the `grafanaDashboards` key, located in the top level object. To add new jsonnet dashboards, simply add one. > Note that dashboards can just as well be included into this file, using the jsonnet `import` function. In this example it is just inlined in order to demonstrate their use in a single file. From 2a96d77dd931274990e62cd431b16282e2eed064 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 28 Jun 2018 17:31:58 +0200 Subject: [PATCH 306/638] kube-prometheus: Re-generate Grafana jsonnet dependency --- manifests/grafana-dashboardDatasources.yaml | 19 +++---------------- manifests/grafana-deployment.yaml | 6 +++--- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/manifests/grafana-dashboardDatasources.yaml b/manifests/grafana-dashboardDatasources.yaml index 5585057c..446c6864 100644 --- a/manifests/grafana-dashboardDatasources.yaml +++ b/manifests/grafana-dashboardDatasources.yaml @@ -1,21 +1,8 @@ apiVersion: v1 data: - prometheus.yaml: |- - { - "apiVersion": 1, - "datasources": [ - { - "access": "proxy", - "editable": false, - "name": "prometheus", - "orgId": 1, - "type": "prometheus", - "url": "http://prometheus-k8s.monitoring.svc:9090", - "version": 1 - } - ] - } -kind: ConfigMap + prometheus.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0= 
+kind: Secret metadata: name: grafana-datasources namespace: monitoring +type: Opaque diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 8002d832..4b00b004 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -69,9 +69,9 @@ spec: volumes: - emptyDir: {} name: grafana-storage - - configMap: - name: grafana-datasources - name: grafana-datasources + - name: grafana-datasources + secret: + secretName: grafana-datasources - configMap: name: grafana-dashboards name: grafana-dashboards From 5f5664b3aa623d59a496c45193fccd0179ace4ee Mon Sep 17 00:00:00 2001 From: Tamal Saha Date: Sat, 30 Jun 2018 01:55:17 -0700 Subject: [PATCH 307/638] Update client libraries to Kubernetes 1.11 --- ...rometheus-operator-0alertmanagerCustomResourceDefinition.yaml | 1 + ...0prometheus-operator-0prometheusCustomResourceDefinition.yaml | 1 + ...metheus-operator-0prometheusruleCustomResourceDefinition.yaml | 1 + ...metheus-operator-0servicemonitorCustomResourceDefinition.yaml | 1 + 4 files changed, 4 insertions(+) diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 560eb6ef..ea5b6663 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -2282,3 +2282,4 @@ status: kind: "" plural: "" conditions: null + storedVersions: null diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 1af66f0f..1622e174 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -2864,3 +2864,4 @@ status: kind: "" plural: "" conditions: null + storedVersions: null diff --git 
a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index 43f98251..baba968c 100644 --- a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -339,3 +339,4 @@ status: kind: "" plural: "" conditions: null + storedVersions: null diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index c6bc96a1..018a612d 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -238,3 +238,4 @@ status: kind: "" plural: "" conditions: null + storedVersions: null From cb504f678aeaefca7b25c9bce916eec5b015fd91 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Sun, 1 Jul 2018 12:18:47 +0200 Subject: [PATCH 308/638] *: Re-generate --- ...0alertmanagerCustomResourceDefinition.yaml | 103 ++++++++++++++++-- ...r-0prometheusCustomResourceDefinition.yaml | 103 ++++++++++++++++-- 2 files changed, 186 insertions(+), 20 deletions(-) diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index ea5b6663..b95a4509 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -53,11 +53,46 @@ spec: properties: preference: description: A null or empty node selector term matches - no objects. + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. properties: matchExpressions: - description: Required. A list of node selector requirements. - The requirements are ANDed. 
+ description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. items: description: A node selector requirement is a selector that contains values, a key, and an operator that @@ -88,8 +123,6 @@ spec: - key - operator type: array - required: - - matchExpressions weight: description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. @@ -110,11 +143,46 @@ spec: terms are ORed. items: description: A null or empty node selector term matches - no objects. + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. properties: matchExpressions: - description: Required. A list of node selector requirements. - The requirements are ANDed. + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. 
+ properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. items: description: A node selector requirement is a selector that contains values, a key, and an operator that @@ -145,8 +213,6 @@ spec: - key - operator type: array - required: - - matchExpressions type: array required: - nodeSelectorTerms @@ -1673,6 +1739,23 @@ spec: format: int64 type: integer type: array + sysctls: + description: Sysctls hold a list of namespaced sysctls used for + the pod. Pods with unsupported sysctls (by the container runtime) + might fail to launch. + items: + description: Sysctl defines a kernel parameter to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: array serviceAccountName: description: ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods. 
diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 1622e174..3e11d52f 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -83,11 +83,46 @@ spec: properties: preference: description: A null or empty node selector term matches - no objects. + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. properties: matchExpressions: - description: Required. A list of node selector requirements. - The requirements are ANDed. + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. 
items: description: A node selector requirement is a selector that contains values, a key, and an operator that @@ -118,8 +153,6 @@ spec: - key - operator type: array - required: - - matchExpressions weight: description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. @@ -140,11 +173,46 @@ spec: terms are ORed. items: description: A null or empty node selector term matches - no objects. + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. properties: matchExpressions: - description: Required. A list of node selector requirements. - The requirements are ANDed. + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. 
items: description: A node selector requirement is a selector that contains values, a key, and an operator that @@ -175,8 +243,6 @@ spec: - key - operator type: array - required: - - matchExpressions type: array required: - nodeSelectorTerms @@ -2100,6 +2166,23 @@ spec: format: int64 type: integer type: array + sysctls: + description: Sysctls hold a list of namespaced sysctls used for + the pod. Pods with unsupported sysctls (by the container runtime) + might fail to launch. + items: + description: Sysctl defines a kernel parameter to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: array serviceAccountName: description: ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods. From 0de5a5c8cd19552ea4f117c52d6b5f619a3d79a2 Mon Sep 17 00:00:00 2001 From: Tamal Saha Date: Sat, 30 Jun 2018 01:55:17 -0700 Subject: [PATCH 309/638] Update client libraries to Kubernetes 1.11 --- ...0alertmanagerCustomResourceDefinition.yaml | 104 ++++++++++++++++-- ...r-0prometheusCustomResourceDefinition.yaml | 104 ++++++++++++++++-- ...rometheusruleCustomResourceDefinition.yaml | 1 + ...ervicemonitorCustomResourceDefinition.yaml | 1 + 4 files changed, 190 insertions(+), 20 deletions(-) diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 560eb6ef..b95a4509 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -53,11 +53,46 @@ spec: properties: preference: description: A null or empty node selector term matches - no objects. + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. 
properties: matchExpressions: - description: Required. A list of node selector requirements. - The requirements are ANDed. + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. items: description: A node selector requirement is a selector that contains values, a key, and an operator that @@ -88,8 +123,6 @@ spec: - key - operator type: array - required: - - matchExpressions weight: description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. @@ -110,11 +143,46 @@ spec: terms are ORed. items: description: A null or empty node selector term matches - no objects. + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. properties: matchExpressions: - description: Required. A list of node selector requirements. - The requirements are ANDed. + description: A list of node selector requirements + by node's labels. 
+ items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. items: description: A node selector requirement is a selector that contains values, a key, and an operator that @@ -145,8 +213,6 @@ spec: - key - operator type: array - required: - - matchExpressions type: array required: - nodeSelectorTerms @@ -1673,6 +1739,23 @@ spec: format: int64 type: integer type: array + sysctls: + description: Sysctls hold a list of namespaced sysctls used for + the pod. Pods with unsupported sysctls (by the container runtime) + might fail to launch. + items: + description: Sysctl defines a kernel parameter to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: array serviceAccountName: description: ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods. 
@@ -2282,3 +2365,4 @@ status: kind: "" plural: "" conditions: null + storedVersions: null diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 1af66f0f..3e11d52f 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -83,11 +83,46 @@ spec: properties: preference: description: A null or empty node selector term matches - no objects. + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. properties: matchExpressions: - description: Required. A list of node selector requirements. - The requirements are ANDed. + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. 
items: description: A node selector requirement is a selector that contains values, a key, and an operator that @@ -118,8 +153,6 @@ spec: - key - operator type: array - required: - - matchExpressions weight: description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. @@ -140,11 +173,46 @@ spec: terms are ORed. items: description: A null or empty node selector term matches - no objects. + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. properties: matchExpressions: - description: Required. A list of node selector requirements. - The requirements are ANDed. + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. 
items: description: A node selector requirement is a selector that contains values, a key, and an operator that @@ -175,8 +243,6 @@ spec: - key - operator type: array - required: - - matchExpressions type: array required: - nodeSelectorTerms @@ -2100,6 +2166,23 @@ spec: format: int64 type: integer type: array + sysctls: + description: Sysctls hold a list of namespaced sysctls used for + the pod. Pods with unsupported sysctls (by the container runtime) + might fail to launch. + items: + description: Sysctl defines a kernel parameter to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: array serviceAccountName: description: ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods. @@ -2864,3 +2947,4 @@ status: kind: "" plural: "" conditions: null + storedVersions: null diff --git a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index 43f98251..baba968c 100644 --- a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -339,3 +339,4 @@ status: kind: "" plural: "" conditions: null + storedVersions: null diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index c6bc96a1..018a612d 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -238,3 +238,4 @@ status: kind: "" plural: "" conditions: null + storedVersions: null From 171b7c41e1173a41b293adbe40341e1778e164c2 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 2 Jul 2018 10:38:38 +0200 Subject: [PATCH 
310/638] *: Re-generate --- ...0alertmanagerCustomResourceDefinition.yaml | 22 +++++++++++++++++++ ...r-0prometheusCustomResourceDefinition.yaml | 22 +++++++++++++++++++ ...rometheusruleCustomResourceDefinition.yaml | 6 +++++ 3 files changed, 50 insertions(+) diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index b95a4509..993cffc0 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -1380,6 +1380,9 @@ spec: request. type: string creationTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of + the factory methods that the time package offers. format: date-time type: string deletionGracePeriodSeconds: @@ -1389,6 +1392,9 @@ spec: format: int64 type: integer deletionTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of + the factory methods that the time package offers. format: date-time type: string finalizers: @@ -1871,6 +1877,10 @@ spec: it if set in create or update request. type: string creationTimestamp: + description: Time is a wrapper around time.Time which supports + correct marshaling to YAML and JSON. Wrappers are provided + for many of the factory methods that the time package + offers. format: date-time type: string deletionGracePeriodSeconds: @@ -1881,6 +1891,10 @@ spec: format: int64 type: integer deletionTimestamp: + description: Time is a wrapper around time.Time which supports + correct marshaling to YAML and JSON. Wrappers are provided + for many of the factory methods that the time package + offers. 
format: date-time type: string finalizers: @@ -2253,9 +2267,17 @@ spec: about state of pvc properties: lastProbeTime: + description: Time is a wrapper around time.Time which + supports correct marshaling to YAML and JSON. Wrappers + are provided for many of the factory methods that + the time package offers. format: date-time type: string lastTransitionTime: + description: Time is a wrapper around time.Time which + supports correct marshaling to YAML and JSON. Wrappers + are provided for many of the factory methods that + the time package offers. format: date-time type: string message: diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 3e11d52f..8e2f8074 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1469,6 +1469,9 @@ spec: request. type: string creationTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of + the factory methods that the time package offers. format: date-time type: string deletionGracePeriodSeconds: @@ -1478,6 +1481,9 @@ spec: format: int64 type: integer deletionTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of + the factory methods that the time package offers. format: date-time type: string finalizers: @@ -2382,6 +2388,10 @@ spec: it if set in create or update request. type: string creationTimestamp: + description: Time is a wrapper around time.Time which supports + correct marshaling to YAML and JSON. Wrappers are provided + for many of the factory methods that the time package + offers. 
format: date-time type: string deletionGracePeriodSeconds: @@ -2392,6 +2402,10 @@ spec: format: int64 type: integer deletionTimestamp: + description: Time is a wrapper around time.Time which supports + correct marshaling to YAML and JSON. Wrappers are provided + for many of the factory methods that the time package + offers. format: date-time type: string finalizers: @@ -2764,9 +2778,17 @@ spec: about state of pvc properties: lastProbeTime: + description: Time is a wrapper around time.Time which + supports correct marshaling to YAML and JSON. Wrappers + are provided for many of the factory methods that + the time package offers. format: date-time type: string lastTransitionTime: + description: Time is a wrapper around time.Time which + supports correct marshaling to YAML and JSON. Wrappers + are provided for many of the factory methods that + the time package offers. format: date-time type: string message: diff --git a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index baba968c..b70cad30 100644 --- a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -40,6 +40,9 @@ spec: going to ignore it if set in create or update request. type: string creationTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of the + factory methods that the time package offers. format: date-time type: string deletionGracePeriodSeconds: @@ -49,6 +52,9 @@ spec: format: int64 type: integer deletionTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of the + factory methods that the time package offers. 
format: date-time type: string finalizers: From 9c26600e0aa601b901204e3eb5a6061ae04e22d2 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 2 Jul 2018 16:17:39 +0200 Subject: [PATCH 311/638] *: Fix CRD generation --- ...0alertmanagerCustomResourceDefinition.yaml | 49 +++++++++++------ ...r-0prometheusCustomResourceDefinition.yaml | 54 ++++++++++++------- ...rometheusruleCustomResourceDefinition.yaml | 9 ---- ...ervicemonitorCustomResourceDefinition.yaml | 14 ++--- 4 files changed, 72 insertions(+), 54 deletions(-) diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 993cffc0..464ab8a0 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -11,7 +11,6 @@ spec: scope: Namespaced validation: openAPIV3Schema: - description: Alertmanager describes an Alertmanager cluster. properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation @@ -815,7 +814,10 @@ spec: path: description: Path to access on the HTTP server. type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use for connecting to the host. Defaults to HTTP. @@ -830,7 +832,10 @@ spec: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string - port: {} + port: + anyOf: + - type: string + - type: integer required: - port preStop: @@ -882,7 +887,10 @@ spec: path: description: Path to access on the HTTP server. type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use for connecting to the host. Defaults to HTTP. @@ -897,7 +905,10 @@ spec: description: 'Optional: Host name to connect to, defaults to the pod IP.' 
type: string - port: {} + port: + anyOf: + - type: string + - type: integer required: - port livenessProbe: @@ -955,7 +966,10 @@ spec: path: description: Path to access on the HTTP server. type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use for connecting to the host. Defaults to HTTP. @@ -986,7 +1000,10 @@ spec: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string - port: {} + port: + anyOf: + - type: string + - type: integer required: - port timeoutSeconds: @@ -1095,7 +1112,10 @@ spec: path: description: Path to access on the HTTP server. type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use for connecting to the host. Defaults to HTTP. @@ -1126,7 +1146,10 @@ spec: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string - port: {} + port: + anyOf: + - type: string + - type: integer required: - port timeoutSeconds: @@ -2379,12 +2402,4 @@ spec: - updatedReplicas - availableReplicas - unavailableReplicas - required: - - spec version: v1 -status: - acceptedNames: - kind: "" - plural: "" - conditions: null - storedVersions: null diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 8e2f8074..e2382fc3 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -11,7 +11,6 @@ spec: scope: Namespaced validation: openAPIV3Schema: - description: Prometheus defines a Prometheus deployment. properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation @@ -640,7 +639,10 @@ spec: pathPrefix: description: Prefix for the HTTP path alerts are pushed to. 
type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use when firing alerts. type: string @@ -898,7 +900,10 @@ spec: path: description: Path to access on the HTTP server. type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use for connecting to the host. Defaults to HTTP. @@ -913,7 +918,10 @@ spec: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string - port: {} + port: + anyOf: + - type: string + - type: integer required: - port preStop: @@ -965,7 +973,10 @@ spec: path: description: Path to access on the HTTP server. type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use for connecting to the host. Defaults to HTTP. @@ -980,7 +991,10 @@ spec: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string - port: {} + port: + anyOf: + - type: string + - type: integer required: - port livenessProbe: @@ -1038,7 +1052,10 @@ spec: path: description: Path to access on the HTTP server. type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use for connecting to the host. Defaults to HTTP. @@ -1069,7 +1086,10 @@ spec: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string - port: {} + port: + anyOf: + - type: string + - type: integer required: - port timeoutSeconds: @@ -1178,7 +1198,10 @@ spec: path: description: Path to access on the HTTP server. type: string - port: {} + port: + anyOf: + - type: string + - type: integer scheme: description: Scheme to use for connecting to the host. Defaults to HTTP. @@ -1209,7 +1232,10 @@ spec: description: 'Optional: Host name to connect to, defaults to the pod IP.' 
type: string - port: {} + port: + anyOf: + - type: string + - type: integer required: - port timeoutSeconds: @@ -2961,12 +2987,4 @@ spec: - updatedReplicas - availableReplicas - unavailableReplicas - required: - - spec version: v1 -status: - acceptedNames: - kind: "" - plural: "" - conditions: null - storedVersions: null diff --git a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index b70cad30..0a9873c9 100644 --- a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -11,7 +11,6 @@ spec: scope: Namespaced validation: openAPIV3Schema: - description: PrometheusRule defines alerting rules for a Prometheus instance properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation @@ -337,12 +336,4 @@ spec: - name - rules type: array - required: - - spec version: v1 -status: - acceptedNames: - kind: "" - plural: "" - conditions: null - storedVersions: null diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index 018a612d..f3068cf8 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -11,7 +11,6 @@ spec: scope: Namespaced validation: openAPIV3Schema: - description: ServiceMonitor defines monitoring for a set of services. properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation @@ -143,7 +142,10 @@ spec: scrapeTimeout: description: Timeout after which the scrape is ended type: string - targetPort: {} + targetPort: + anyOf: + - type: string + - type: integer tlsConfig: description: TLSConfig specifies TLS configuration parameters. 
properties: @@ -230,12 +232,4 @@ spec: required: - endpoints - selector - required: - - spec version: v1 -status: - acceptedNames: - kind: "" - plural: "" - conditions: null - storedVersions: null From b6e323ca4629d7a66ccc86236a7e0185cda5d70e Mon Sep 17 00:00:00 2001 From: luarx Date: Fri, 22 Jun 2018 17:03:01 +0200 Subject: [PATCH 312/638] Add a tip about compiling process --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 89686cf4..7ec73350 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,8 @@ jsonnet -J vendor -m manifests ${1-example.jsonnet} | xargs -I{} sh -c 'cat $1 | This script reads each key of the generated json and uses that as the file name, and writes the value of that key to that file. +> You can also run this script by executing the command `make generate-raw` from the kube-prometheus base directory of this repository, but the above option is recommended so that you run it in your own infrastructure repository. + ## Configuration A hidden `_config` field is located at the top level of the object this library provides.
These are the available fields with their respective default values: From ad748858e3a185d0297030cb10bbe5b7204f77f5 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 26 Jun 2018 14:12:49 +0200 Subject: [PATCH 313/638] kube-prometheus: Add kube-prometheus mixin to configure static etcd --- README.md | 31 ++++++ examples/etcd-client-ca.crt | 0 examples/etcd-client.crt | 0 examples/etcd-client.key | 0 examples/etcd.jsonnet | 22 +++++ jsonnet/kube-prometheus/jsonnetfile.json | 10 ++ .../kube-prometheus-static-etcd.libsonnet | 95 +++++++++++++++++++ 7 files changed, 158 insertions(+) create mode 100644 examples/etcd-client-ca.crt create mode 100644 examples/etcd-client.crt create mode 100644 examples/etcd-client.key create mode 100644 examples/etcd.jsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet diff --git a/README.md b/README.md index 89686cf4..dd6e0b73 100644 --- a/README.md +++ b/README.md @@ -285,6 +285,37 @@ In the above example the configuration has been inlined, but can just as well be }, }).alertmanager.secret ``` +### Static etcd configuration + +In order to configure a static etcd cluster to scrape there is a simple mixin prepared, so only the IPs and certificate information need to be configured. Simply append the `kube-prometheus/kube-prometheus-static-etcd.libsonnet` mixin to the rest of the configuration, and configure the `ips` to be the IPs to scrape, and the `clientCA`, `clientKey` and `clientCert` to values that are valid to scrape etcd metrics with. + +Most likely these certificates are generated somewhere in an infrastructure repository, so using the jsonnet `importstr` function can be useful here. All the sensitive information on the certificates will end up in a Kubernetes Secret. 
+ +[embedmd]:# (examples/etcd.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + { + _config+:: { + namespace: 'monitoring', + + etcd+:: { + ips: ['127.0.0.1'], + clientCA: importstr 'etcd-client-ca.crt', + clientKey: importstr 'etcd-client.key', + clientCert: importstr 'etcd-client.crt', + serverName: 'etcd.my-cluster.local', + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` ### Customizing Prometheus alerting/recording rules and Grafana dashboards diff --git a/examples/etcd-client-ca.crt b/examples/etcd-client-ca.crt new file mode 100644 index 00000000..e69de29b diff --git a/examples/etcd-client.crt b/examples/etcd-client.crt new file mode 100644 index 00000000..e69de29b diff --git a/examples/etcd-client.key b/examples/etcd-client.key new file mode 100644 index 00000000..e69de29b diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet new file mode 100644 index 00000000..c521d1cd --- /dev/null +++ b/examples/etcd.jsonnet @@ -0,0 +1,22 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + { + _config+:: { + namespace: 'monitoring', + + etcd+:: { + ips: ['127.0.0.1'], + clientCA: importstr 
'etcd-client-ca.crt', + clientKey: importstr 'etcd-client.key', + clientCert: importstr 'etcd-client.crt', + serverName: 'etcd.my-cluster.local', + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index a7b0a203..719f0e94 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -39,6 +39,16 @@ } }, "version": "master" + }, + { + "name": "etcd-mixin", + "source": { + "git": { + "remote": "https://github.com/coreos/etcd", + "subdir": "Documentation/etcd-mixin" + } + }, + "version": "master" } ] } \ No newline at end of file diff --git a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet new file mode 100644 index 00000000..23883c2c --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet @@ -0,0 +1,95 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +(import 'etcd-mixin/mixin.libsonnet') + { + _config+:: { + etcd: { + ips: [], + clientCA: null, + clientKey: null, + clientCert: null, + serverName: null, + }, + }, + prometheus+:: { + serviceEtcd: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; 
+ + local etcdServicePort = servicePort.newNamed('metrics', 2379, 2379); + + service.new('etcd', null, etcdServicePort) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'etcd' }) + + service.mixin.spec.withClusterIp('None'), + endpointsEtcd: + local endpoints = k.core.v1.endpoints; + local endpointSubset = endpoints.subsetsType; + local endpointPort = endpointSubset.portsType; + + local etcdPort = endpointPort.new() + + endpointPort.withName('metrics') + + endpointPort.withPort(2379) + + endpointPort.withProtocol('TCP'); + + local subset = endpointSubset.new() + + endpointSubset.withAddresses([ + { ip: etcdIP } + for etcdIP in $._config.etcd.ips + ]) + + endpointSubset.withPorts(etcdPort); + + endpoints.new() + + endpoints.mixin.metadata.withName('etcd') + + endpoints.mixin.metadata.withNamespace('kube-system') + + endpoints.mixin.metadata.withLabels({ 'k8s-app': 'etcd' }) + + endpoints.withSubsets(subset), + serviceMonitorEtcd: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'etcd', + namespace: 'kube-system', + labels: { + 'k8s-app': 'etcd', + }, + }, + spec: { + jobLabel: 'k8s-app', + endpoints: [ + { + port: 'metrics', + interval: '30s', + scheme: 'https', + tlsConfig: { + caFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt', + keyFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key', + certFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.crt', + serverName: $._config.etcd.serverName, + }, + }, + ], + selector: { + matchLabels: { + 'k8s-app': 'etcd', + }, + }, + }, + }, + secretEtcdCerts: + local secret = k.core.v1.secret; + + secret.new('kube-etcd-client-certs', { + 'etcd-client-ca.crt': std.base64($._config.etcd.clientCA), + 'etcd-client.key': std.base64($._config.etcd.clientKey), + 'etcd-client.crt': std.base64($._config.etcd.clientCert), + }) + + 
secret.mixin.metadata.withNamespace($._config.namespace), + prometheus+: + { + spec+: { + secrets+: [$.prometheus.secretEtcdCerts.metadata.name], + }, + }, + }, +} From cd709826c9abd51887b5f496af0b585455b28a7c Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 18 Jun 2018 23:00:48 +0200 Subject: [PATCH 314/638] format: Introduce shellcheck [1] for shell script analysis [1] https://github.com/koalaman/shellcheck --- Makefile | 5 +---- README.md | 2 +- build.sh | 2 +- experimental/custom-metrics-api/gencerts.sh | 9 +++++---- test.sh | 7 ++++--- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index bab98bcb..74ccee14 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,6 @@ JSONNET_FMT := jsonnet fmt -n 2 --max-blank-lines 2 --string-style s --comment-s JB_BINARY:=$(GOPATH)/bin/jb EMBEDMD_BINARY:=$(GOPATH)/bin/embedmd -# edit 1 - all: generate fmt test ../../hack/jsonnet-docker-image: ../../scripts/jsonnet/Dockerfile @@ -26,10 +24,9 @@ generate-in-docker: ../../hack/jsonnet-docker-image generate: manifests **.md **.md: $(EMBEDMD_BINARY) $(shell find examples) build.sh example.jsonnet - echo '>>> inside **.md' $(EMBEDMD_BINARY) -w `find . 
-name "*.md" | grep -v vendor` -manifests: vendor example.jsonnet +manifests: vendor example.jsonnet build.sh rm -rf manifests ./build.sh diff --git a/README.md b/README.md index 5b2141c8..bf61d9ae 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ rm -rf manifests mkdir manifests # optional, but we would like to generate yaml, not json -jsonnet -J vendor -m manifests ${1-example.jsonnet} | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} +jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml; rm -f {}' -- {} ``` diff --git a/build.sh b/build.sh index 6037c3db..4eaf5ac5 100755 --- a/build.sh +++ b/build.sh @@ -9,5 +9,5 @@ rm -rf manifests mkdir manifests # optional, but we would like to generate yaml, not json -jsonnet -J vendor -m manifests ${1-example.jsonnet} | xargs -I{} sh -c 'cat $1 | gojsontoyaml > $1.yaml; rm -f $1' -- {} +jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml; rm -f {}' -- {} diff --git a/experimental/custom-metrics-api/gencerts.sh b/experimental/custom-metrics-api/gencerts.sh index 7cd8af93..b1e16031 100755 --- a/experimental/custom-metrics-api/gencerts.sh +++ b/experimental/custom-metrics-api/gencerts.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Detect if we are on mac or should use GNU base64 options -case `uname` in +case $(uname) in Darwin) b64_opts='-b=0' ;; @@ -17,7 +17,8 @@ echo '{"signing":{"default":{"expiry":"43800h","usages":["signing","key encipher export SERVICE_NAME=custom-metrics-apiserver export ALT_NAMES='"custom-metrics-apiserver.monitoring","custom-metrics-apiserver.monitoring.svc"' -echo '{"CN":"'${SERVICE_NAME}'","hosts":['${ALT_NAMES}'],"key":{"algo":"rsa","size":2048}}' | cfssl gencert -ca=metrics-ca.crt -ca-key=metrics-ca.key -config=metrics-ca-config.json - | cfssljson -bare apiserver +echo "{\"CN\":\"${SERVICE_NAME}\", \"hosts\": [${ALT_NAMES}], \"key\": {\"algo\": \"rsa\",\"size\": 2048}}" 
| \ + cfssl gencert -ca=metrics-ca.crt -ca-key=metrics-ca.key -config=metrics-ca-config.json - | cfssljson -bare apiserver cat <<-EOF > cm-adapter-serving-certs.yaml apiVersion: v1 @@ -25,6 +26,6 @@ kind: Secret metadata: name: cm-adapter-serving-certs data: - serving.crt: $(cat apiserver.pem | base64 ${b64_opts}) - serving.key: $(cat apiserver-key.pem | base64 ${b64_opts}) + serving.crt: $(base64 ${b64_opts} < apiserver.pem) + serving.key: $(base64 ${b64_opts} < apiserver-key.pem) EOF diff --git a/test.sh b/test.sh index dad4e75f..cfdf584c 100755 --- a/test.sh +++ b/test.sh @@ -8,7 +8,8 @@ for i in examples/jsonnet-snippets/*.jsonnet; do [ -f "$i" ] || break echo "Testing: ${i}" echo "" - snippet="local kp = $(<${i}); + fileContent=$(<"$i") + snippet="local kp = $fileContent; $( "test.jsonnet" @@ -25,8 +26,8 @@ for i in examples/*.jsonnet; do echo "Testing: ${i}" echo "" echo "\`\`\`" - echo "$(<${i})" + cat "${i}" echo "\`\`\`" echo "" - jsonnet -J vendor ${i} > /dev/null + jsonnet -J vendor "${i}" > /dev/null done From 9ec8ae65ee1cc201c7987efd65e466d281de674b Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 5 Jul 2018 11:27:37 +0200 Subject: [PATCH 315/638] *: Allow settings custom tags of container images --- ...us-operator-0alertmanagerCustomResourceDefinition.yaml | 4 ++++ ...heus-operator-0prometheusCustomResourceDefinition.yaml | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 464ab8a0..6f30397a 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -2325,6 +2325,10 @@ spec: phase: description: Phase represents the current phase of PersistentVolumeClaim. type: string + tag: + description: Tag of Alertmanager container image to be deployed. 
Defaults + to the value of `version`. + type: string tolerations: description: If specified, the pod's tolerations. items: diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index e2382fc3..140deffa 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -2839,6 +2839,10 @@ spec: phase: description: Phase represents the current phase of PersistentVolumeClaim. type: string + tag: + description: Tag of Prometheus container image to be deployed. Defaults + to the value of `version`. + type: string thanos: description: ThanosSpec defines parameters for a Prometheus server within a Thanos deployment. @@ -2907,6 +2911,10 @@ spec: description: Whether to use S3 Signature Version 2; otherwise Signature Version 4 will be used. type: boolean + tag: + description: Tag of Thanos sidecar container image to be deployed. + Defaults to the value of `version`. + type: string version: description: Version describes the version of Thanos to use. 
type: string From 4083705edfe22a9e061bde48da0dec82be5347b6 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 9 Jul 2018 16:17:31 +0200 Subject: [PATCH 316/638] *: Cut 0.22.0 --- manifests/0prometheus-operator-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index df15b954..faca5a84 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -19,8 +19,8 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.21.0 - image: quay.io/coreos/prometheus-operator:v0.21.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.0 + image: quay.io/coreos/prometheus-operator:v0.22.0 name: prometheus-operator ports: - containerPort: 8080 From b6e7d708c5822baa1aa713575c568e2279941390 Mon Sep 17 00:00:00 2001 From: seph Date: Fri, 13 Jul 2018 11:48:27 -0400 Subject: [PATCH 317/638] Configure kube-state-metrics As I work with kube-state-metrics in a large cluster, I found I needed to make some adjustments. - Expose the collectors, allowing one to configure exclusions. - Expose the addon_resizer parameters, facilitating reproduce adjustments - Allow adjusting scrapeTimeout and scrapeInterval --- README.md | 21 ++++++++++++ .../kube-state-metrics.libsonnet | 33 ++++++++++++++----- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index bf61d9ae..a9a42e54 100644 --- a/README.md +++ b/README.md @@ -369,3 +369,24 @@ The Prometheus `/targets` page will show the kubelet job with the error `403 Una #### Authorization problem The Prometheus `/targets` page will show the kubelet job with the error `401 Unauthorized`, when token authorization is not enabled. 
Ensure that the `--authorization-mode=Webhook` flag is enabled on all kubelet configurations. +### kube-state-metrics resource usaged + +In some environments, kube-state-metrics may need additional +resources. One driver for more resource needs, is a high number of +namespaces. There may be others. + +kube-state-metrics has it's resources using an +[addon-resizer](https://github.com/kubernetes/autoscaler/tree/master/addon-resizer/nanny) +You can control it's parameters by setting variables in the +config. They default to: + +``` jsonnet + resizer+:: { + kubeStateMetrics+:: { + cpu: '100m', + extraCpu: '2m', + memory: '150Mi', + extraMemory: '30Mi', + }, + } +``` diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index c36f293b..f9065282 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -4,6 +4,22 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; _config+:: { namespace: 'default', + kubeStateMetrics+:: { + // when this is an empty string, you get the default set + collectors: '', + scrapeTimeout: '', + scrapeInterval: '30s', + }, + + resizer+:: { + kubeStateMetrics+:: { + cpu: '100m', + extraCpu: '2m', + memory: '150Mi', + extraMemory: '30Mi', + }, + }, + versions+:: { kubeStateMetrics: 'v1.3.1', kubeRbacProxy: 'v0.3.1', @@ -137,19 +153,20 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082', + '--collectors=' + $._config.kubeStateMetrics.collectors, ]) + - container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + - container.mixin.resources.withLimits({ cpu: '102m', memory: '180Mi' }); + container.mixin.resources.withRequests({ cpu: $._config.resizer.kubeStateMetrics.cpu, memory: $._config.resizer.kubeStateMetrics.memory }) + + 
container.mixin.resources.withLimits({ cpu: $._config.resizer.kubeStateMetrics.cpu, memory: $._config.resizer.kubeStateMetrics.memory }); local addonResizer = container.new('addon-resizer', $._config.imageRepos.addonResizer + ':' + $._config.versions.addonResizer) + container.withCommand([ '/pod_nanny', '--container=kube-state-metrics', - '--cpu=100m', - '--extra-cpu=2m', - '--memory=150Mi', - '--extra-memory=30Mi', + '--cpu=' + $._config.resizer.kubeStateMetrics.cpu, + '--extra-cpu=' + $._config.resizer.kubeStateMetrics.extraCpu, + '--memory=' + $._config.resizer.kubeStateMetrics.memory, + '--extra-memory=' + $._config.resizer.kubeStateMetrics.extraMemory, '--threshold=5', '--deployment=kube-state-metrics', ]) + @@ -258,13 +275,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; { port: 'https-main', scheme: 'https', - interval: '30s', + interval: $._config.kubeStateMetrics.scrapeInterval, honorLabels: true, bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', tlsConfig: { insecureSkipVerify: true, }, - }, + } + if $._config.kubeStateMetrics.scrapeTimeout != '' then { scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout } else {}, { port: 'https-self', scheme: 'https', From dabfca595bed9142a1850dca958a7e18037a18b2 Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 17 Jul 2018 15:10:38 +0200 Subject: [PATCH 318/638] Makefile: Properly rebuild po-docgen on src changes (#1625) --- manifests/grafana-dashboardDefinitions.yaml | 34 +++++++++++++++++++-- manifests/prometheus-rules.yaml | 2 +- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index a0dba292..1143970e 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2501,6 +2501,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2509,6 +2510,7 @@ items: "instant": true, 
"intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2517,6 +2519,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2525,6 +2528,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2533,6 +2537,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -2861,6 +2866,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2869,6 +2875,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2877,6 +2884,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2885,6 +2893,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2893,6 +2902,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3303,6 +3313,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3311,6 +3322,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3319,6 +3331,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3327,6 +3340,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3335,6 +3349,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3663,6 +3678,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3671,6 +3687,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3679,6 +3696,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3687,6 +3705,7 @@ 
items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3695,6 +3714,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4132,6 +4152,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4140,6 +4161,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4148,6 +4170,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4156,6 +4179,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4164,6 +4188,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4492,6 +4517,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4500,6 +4526,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4508,6 +4535,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4516,6 +4544,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4524,6 +4553,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -5696,14 +5726,14 @@ items: "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) 
(kube_pod_container_resource_limits_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 75d5f36e..49c4a995 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -388,7 +388,7 @@ spec: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} - for: 15m + for: 1h labels: severity: critical - alert: KubeStatefulSetReplicasMismatch From efe686c0c09efd920ff6d60461842a8a1e2c48d9 Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 17 Jul 2018 15:11:46 +0200 Subject: [PATCH 319/638] security: Enforce nobody user and read only / (#1393) * Make the Prometheus Operator Docker image run as `nobody` by default. 
* Disallow privilege escalation via K8s * Enforce read only root filesystem --- manifests/0prometheus-operator-deployment.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index faca5a84..5a193a35 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -18,6 +18,7 @@ spec: containers: - args: - --kubelet-service=kube-system/kubelet + - -logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.0 image: quay.io/coreos/prometheus-operator:v0.22.0 @@ -32,6 +33,9 @@ spec: requests: cpu: 100m memory: 50Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true nodeSelector: beta.kubernetes.io/os: linux securityContext: From 358c8477eae9429cf5a8b889c72176ca95caee6d Mon Sep 17 00:00:00 2001 From: seph Date: Tue, 17 Jul 2018 09:52:30 -0400 Subject: [PATCH 320/638] Resource config now in config.kubeStateMetrics As requested, this updates the resource specification to live directly in config.kubeStateMetrics It also clarifies the config variables. These names are what google uses in some of their tooling. (And a slight tweak to the way collectors are specified) --- README.md | 17 +++++----- .../kube-state-metrics.libsonnet | 32 ++++++++----------- 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index a9a42e54..67cbd4c1 100644 --- a/README.md +++ b/README.md @@ -369,24 +369,23 @@ The Prometheus `/targets` page will show the kubelet job with the error `403 Una #### Authorization problem The Prometheus `/targets` page will show the kubelet job with the error `401 Unauthorized`, when token authorization is not enabled. Ensure that the `--authorization-mode=Webhook` flag is enabled on all kubelet configurations. 
-### kube-state-metrics resource usaged + +### kube-state-metrics resource usage In some environments, kube-state-metrics may need additional resources. One driver for more resource needs, is a high number of namespaces. There may be others. -kube-state-metrics has it's resources using an +kube-state-metrics resource allocation is managed by [addon-resizer](https://github.com/kubernetes/autoscaler/tree/master/addon-resizer/nanny) You can control it's parameters by setting variables in the config. They default to: ``` jsonnet - resizer+:: { - kubeStateMetrics+:: { - cpu: '100m', - extraCpu: '2m', - memory: '150Mi', - extraMemory: '30Mi', - }, + kubeStateMetrics+:: { + baseCPU: '100m', + cpuPerNode: '2m', + baseMemory: '150Mi', + memoryPerNode: '30Mi', } ``` diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index f9065282..59c0104a 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -5,19 +5,14 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', kubeStateMetrics+:: { - // when this is an empty string, you get the default set - collectors: '', - scrapeTimeout: '', + collectors: '', // empty string gets a default set scrapeInterval: '30s', - }, + scrapeTimeout: '', - resizer+:: { - kubeStateMetrics+:: { - cpu: '100m', - extraCpu: '2m', - memory: '150Mi', - extraMemory: '30Mi', - }, + baseCPU: '100m', + baseMemory: '150Mi', + cpuPerNode: '2m', + memoryPerNode: '30Mi', }, versions+:: { @@ -153,20 +148,19 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082', - '--collectors=' + $._config.kubeStateMetrics.collectors, - ]) + - container.mixin.resources.withRequests({ cpu: $._config.resizer.kubeStateMetrics.cpu, memory: 
$._config.resizer.kubeStateMetrics.memory }) + - container.mixin.resources.withLimits({ cpu: $._config.resizer.kubeStateMetrics.cpu, memory: $._config.resizer.kubeStateMetrics.memory }); + ] + if $._config.kubeStateMetrics.collectors != '' then ['--collectors=' + $._config.kubeStateMetrics.collectors] else []) + + container.mixin.resources.withRequests({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory }) + + container.mixin.resources.withLimits({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory }); local addonResizer = container.new('addon-resizer', $._config.imageRepos.addonResizer + ':' + $._config.versions.addonResizer) + container.withCommand([ '/pod_nanny', '--container=kube-state-metrics', - '--cpu=' + $._config.resizer.kubeStateMetrics.cpu, - '--extra-cpu=' + $._config.resizer.kubeStateMetrics.extraCpu, - '--memory=' + $._config.resizer.kubeStateMetrics.memory, - '--extra-memory=' + $._config.resizer.kubeStateMetrics.extraMemory, + '--cpu=' + $._config.kubeStateMetrics.baseCPU, + '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode, + '--memory=' + $._config.kubeStateMetrics.baseMemory, + '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode, '--threshold=5', '--deployment=kube-state-metrics', ]) + From 596b8697d005fa388515a675346ca054f29f17e2 Mon Sep 17 00:00:00 2001 From: seph Date: Tue, 17 Jul 2018 10:13:18 -0400 Subject: [PATCH 321/638] Set default scrape values We default to a 30s scrapeInterval, we may as well also set scrapeTimeout to the same. 
--- .../kube-state-metrics/kube-state-metrics.libsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 59c0104a..2805fc9d 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -7,7 +7,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; kubeStateMetrics+:: { collectors: '', // empty string gets a default set scrapeInterval: '30s', - scrapeTimeout: '', + scrapeTimeout: '30s', baseCPU: '100m', baseMemory: '150Mi', @@ -270,12 +270,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; port: 'https-main', scheme: 'https', interval: $._config.kubeStateMetrics.scrapeInterval, + scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout, honorLabels: true, bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', tlsConfig: { insecureSkipVerify: true, }, - } + if $._config.kubeStateMetrics.scrapeTimeout != '' then { scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout } else {}, + }, { port: 'https-self', scheme: 'https', From 04cf9ce35a6dd66006d1a58d2b7d720bf55efcef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 17 Jul 2018 19:49:42 +0200 Subject: [PATCH 322/638] *: Re-generate --- manifests/grafana-dashboardDefinitions.yaml | 34 +++++++++++++++++-- manifests/kube-state-metrics-deployment.yaml | 8 ++--- .../kube-state-metrics-serviceMonitor.yaml | 1 + manifests/prometheus-rules.yaml | 2 +- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index a0dba292..1143970e 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2501,6 +2501,7 @@ items: "instant": true, "intervalFactor": 2, 
"legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2509,6 +2510,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2517,6 +2519,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2525,6 +2528,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2533,6 +2537,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -2861,6 +2866,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2869,6 +2875,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2877,6 +2884,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2885,6 +2893,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2893,6 +2902,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3303,6 +3313,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3311,6 +3322,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3319,6 +3331,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3327,6 +3340,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3335,6 +3349,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3663,6 +3678,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3671,6 +3687,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3679,6 +3696,7 @@ items: "instant": 
true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3687,6 +3705,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3695,6 +3714,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4132,6 +4152,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4140,6 +4161,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4148,6 +4170,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4156,6 +4179,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4164,6 +4188,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4492,6 +4517,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4500,6 +4526,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4508,6 +4535,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4516,6 +4544,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4524,6 +4553,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -5696,14 +5726,14 @@ items: "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index c7bb25c6..065c87a9 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -55,11 +55,11 @@ spec: name: kube-state-metrics resources: limits: - cpu: 102m - memory: 180Mi + cpu: 100m + memory: 150Mi requests: - cpu: 102m - memory: 180Mi + cpu: 100m + memory: 150Mi - command: - /pod_nanny - --container=kube-state-metrics diff --git a/manifests/kube-state-metrics-serviceMonitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml index 3d1073ad..2100449d 100644 --- a/manifests/kube-state-metrics-serviceMonitor.yaml +++ b/manifests/kube-state-metrics-serviceMonitor.yaml @@ -12,6 +12,7 @@ spec: interval: 30s port: https-main scheme: https + scrapeTimeout: 30s tlsConfig: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 75d5f36e..49c4a995 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -388,7 +388,7 @@ spec: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} - for: 15m + for: 1h labels: severity: critical - alert: KubeStatefulSetReplicasMismatch From ade7b88d654d698def489e860c5e36d522db6c44 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 18 Jul 2018 
10:25:09 +0200 Subject: [PATCH 323/638] Update jsonnet dependencies --- manifests/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 4b00b004..cb8cc9d8 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: grafana/grafana:5.1.0 + - image: grafana/grafana:5.2.1 name: grafana ports: - containerPort: 3000 From 4d2cddad547057cca141e48b9b4e6d5a70b8027e Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Thu, 19 Jul 2018 16:05:34 +0200 Subject: [PATCH 324/638] contrib: regenerate --- manifests/grafana-dashboardDefinitions.yaml | 34 +++++++++++++++++++-- manifests/grafana-deployment.yaml | 2 +- manifests/prometheus-rules.yaml | 2 +- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index a0dba292..1143970e 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2501,6 +2501,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2509,6 +2510,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2517,6 +2519,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2525,6 +2528,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2533,6 +2537,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -2861,6 +2866,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -2869,6 +2875,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -2877,6 +2884,7 @@ items: 
"instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -2885,6 +2893,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -2893,6 +2902,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3303,6 +3313,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3311,6 +3322,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3319,6 +3331,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3327,6 +3340,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3335,6 +3349,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -3663,6 +3678,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -3671,6 +3687,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -3679,6 +3696,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -3687,6 +3705,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -3695,6 +3714,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4132,6 +4152,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4140,6 +4161,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4148,6 +4170,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4156,6 +4179,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ 
-4164,6 +4188,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -4492,6 +4517,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "A", "step": 10 }, { @@ -4500,6 +4526,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "B", "step": 10 }, { @@ -4508,6 +4535,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "C", "step": 10 }, { @@ -4516,6 +4544,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "D", "step": 10 }, { @@ -4524,6 +4553,7 @@ items: "instant": true, "intervalFactor": 2, "legendFormat": "", + "refId": "E", "step": 10 } ], @@ -5696,14 +5726,14 @@ items: "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kubelet\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 4b00b004..cb8cc9d8 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: grafana/grafana:5.1.0 + - image: 
grafana/grafana:5.2.1 name: grafana ports: - containerPort: 3000 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 75d5f36e..49c4a995 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -388,7 +388,7 @@ spec: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"} - for: 15m + for: 1h labels: severity: critical - alert: KubeStatefulSetReplicasMismatch From 55dbef6f4377ab6312060f0529cbe513c7cf3605 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Thu, 19 Jul 2018 17:19:22 +0200 Subject: [PATCH 325/638] *: cut 0.22.1 --- manifests/0prometheus-operator-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index faca5a84..605b5896 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -19,8 +19,8 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.0 - image: quay.io/coreos/prometheus-operator:v0.22.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.1 + image: quay.io/coreos/prometheus-operator:v0.22.1 name: prometheus-operator ports: - containerPort: 8080 From 06df9fb67d6b3b6e124df6622ba8eb6f8aead47a Mon Sep 17 00:00:00 2001 From: Max Inden Date: Fri, 20 Jul 2018 15:09:17 +0200 Subject: [PATCH 326/638] bundle.yaml: Bump Prometheus Operator memory request and limit (#1622) When handling big Kubernetes objects, marshalling objects is memory-intensive. This can be reproduced with the end-to-end test `TestPrometheusRulesExceedingConfigMapLimit`. This patch doubles the memory request and limit of the Prometheus Operator deployment to 100Mi and 200Mi respectively. 
--- manifests/0prometheus-operator-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index b5be341e..06b295f2 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -29,10 +29,10 @@ spec: resources: limits: cpu: 200m - memory: 100Mi + memory: 200Mi requests: cpu: 100m - memory: 50Mi + memory: 100Mi securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true From d728ab5511ea839448499998358c4e5c53c86d24 Mon Sep 17 00:00:00 2001 From: Ali Rizwan Date: Mon, 23 Jul 2018 12:20:54 +0200 Subject: [PATCH 327/638] Only alert for nodes that currently exist (#1661) --- jsonnet/kube-prometheus/alerts/node.libsonnet | 4 ++-- manifests/prometheus-rules.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet index 46a5e36d..5c24f09f 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -11,7 +11,7 @@ summary: 'Node disk is running full within 24 hours', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 + predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{%(nodeExporterSelector)s} ||| % $._config, 'for': '30m', labels: { @@ -25,7 +25,7 @@ summary: 'Node disk is running full within 2 hours', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 + predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) 
up{%(nodeExporterSelector)s} ||| % $._config, 'for': '10m', labels: { diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 49c4a995..5af7d2fa 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -744,7 +744,7 @@ spec: full within the next 24 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 24 hours expr: | - predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 + predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{job="node-exporter"} for: 30m labels: severity: warning @@ -754,7 +754,7 @@ spec: full within the next 2 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 2 hours expr: | - predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 + predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) up{job="node-exporter"} for: 10m labels: severity: critical From d1cd95190303c2dd4ce05d7236124c3c707e8948 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Mon, 23 Jul 2018 12:57:03 +0200 Subject: [PATCH 328/638] *: regenerate --- ...erator-0alertmanagerCustomResourceDefinition.yaml | 10 +++++----- ...operator-0prometheusCustomResourceDefinition.yaml | 12 ++++++------ ...ator-0servicemonitorCustomResourceDefinition.yaml | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 6f30397a..9d782f51 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ 
b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'Specification of the desired behavior of the Alertmanager - cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'AlertmanagerSpec is a specification of the desired behavior + of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: affinity: description: Affinity is a group of affinity scheduling rules. @@ -2372,9 +2372,9 @@ spec: description: Version the cluster should be on. type: string status: - description: 'Most recent observed status of the Alertmanager cluster. Read-only. - Not included when requesting from the apiserver, only from the Prometheus - Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'AlertmanagerStatus is the most recent observed status of the + Alertmanager cluster. Read-only. Not included when requesting from the + apiserver, only from the Prometheus Operator API itself. 
More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 140deffa..df1274eb 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'Specification of the desired behavior of the Prometheus cluster. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'PrometheusSpec is a specification of the desired behavior + of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: additionalAlertManagerConfigs: description: SecretKeySelector selects a key of a Secret. @@ -2862,7 +2862,7 @@ spec: description: Peers is a DNS name for Thanos to discover peers through. type: string s3: - description: ThanosSpec defines parameters for of AWS Simple Storage + description: ThanosS3Spec defines parameters for of AWS Simple Storage Service (S3) with Thanos. (S3 compatible services apply as well) properties: accessKey: @@ -2961,9 +2961,9 @@ spec: description: Version of Prometheus to be deployed. type: string status: - description: 'Most recent observed status of the Prometheus cluster. Read-only. - Not included when requesting from the apiserver, only from the Prometheus - Operator API itself. 
More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'PrometheusStatus is the most recent observed status of the + Prometheus cluster. Read-only. Not included when requesting from the apiserver, + only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index f3068cf8..9d96bfeb 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -169,7 +169,7 @@ spec: description: The label to use to retrieve the job name from. type: string namespaceSelector: - description: A selector for selecting namespaces either selecting all + description: NamespaceSelector is a selector for selecting either all namespaces or a list of namespaces. 
properties: any: From e47243b413a0e6b657407029eeb13f4de80051a1 Mon Sep 17 00:00:00 2001 From: Saverio Proto Date: Tue, 24 Jul 2018 12:58:40 +0200 Subject: [PATCH 329/638] metrics-server: enable access to nodes/stats Without this access the logs of metrics-server will show the following error line: ``` unable to fully scrape metrics from source kubelet_summary:k8s-1: unable to fetch metrics from Kubelet k8s-1 (10.8.10.14): request failed - "403 Forbidden", response: "Forbidden (user=system:serviceaccount:kube-system:metrics-server, verb=get, resource=nodes, subresource=stats)", ``` and `kubectl top nodes` will give no results --- experimental/metrics-server/metrics-server-cluster-role.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/experimental/metrics-server/metrics-server-cluster-role.yaml b/experimental/metrics-server/metrics-server-cluster-role.yaml index 6976f5ce..38844d9a 100644 --- a/experimental/metrics-server/metrics-server-cluster-role.yaml +++ b/experimental/metrics-server/metrics-server-cluster-role.yaml @@ -8,6 +8,7 @@ rules: resources: - pods - nodes + - nodes/stats - namespaces verbs: - get From 8edd622f2e2c03e1c08a0808c041db09d270a757 Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 24 Jul 2018 17:17:53 +0200 Subject: [PATCH 330/638] *: cut 0.22.2 (#1683) --- manifests/0prometheus-operator-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 605b5896..358fb6e2 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -19,8 +19,8 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.1 - image: quay.io/coreos/prometheus-operator:v0.22.1 + - 
--prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.2 + image: quay.io/coreos/prometheus-operator:v0.22.2 name: prometheus-operator ports: - containerPort: 8080 From cf6232c479159fa0c9a05fc77208b4ba9cc993cb Mon Sep 17 00:00:00 2001 From: Mark Eijsermans Date: Tue, 24 Jul 2018 16:28:04 -0700 Subject: [PATCH 331/638] adjust rbac apiGroups to support core workloads api apps/v1 --- .../kube-state-metrics/kube-state-metrics.libsonnet | 13 ++++++++++++- manifests/kube-state-metrics-clusterRole.yaml | 3 +++ manifests/kube-state-metrics-role.yaml | 9 +++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 2805fc9d..2152c65f 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -74,6 +74,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; policyRule.withApiGroups(['apps']) + policyRule.withResources([ 'statefulsets', + 'daemonsets', + 'deployments', + 'replicasets', ]) + policyRule.withVerbs(['list', 'watch']); @@ -222,7 +225,15 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; policyRule.withVerbs(['get', 'update']) + policyRule.withResourceNames(['kube-state-metrics']); - local rules = [coreRule, extensionsRule]; + local appsRule = policyRule.new() + + policyRule.withApiGroups(['apps']) + + policyRule.withResources([ + 'deployments', + ]) + + policyRule.withVerbs(['get', 'update']) + + policyRule.withResourceNames(['kube-state-metrics']); + + local rules = [coreRule, extensionsRule, appsRule]; role.new() + role.mixin.metadata.withName('kube-state-metrics') + diff --git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml index cae18483..c519a918 100644 --- a/manifests/kube-state-metrics-clusterRole.yaml +++ 
b/manifests/kube-state-metrics-clusterRole.yaml @@ -34,6 +34,9 @@ rules: - apps resources: - statefulsets + - daemonsets + - deployments + - replicasets verbs: - list - watch diff --git a/manifests/kube-state-metrics-role.yaml b/manifests/kube-state-metrics-role.yaml index 0063ffb4..e03d8898 100644 --- a/manifests/kube-state-metrics-role.yaml +++ b/manifests/kube-state-metrics-role.yaml @@ -19,3 +19,12 @@ rules: verbs: - get - update +- apiGroups: + - apps + resourceNames: + - kube-state-metrics + resources: + - deployments + verbs: + - get + - update From d01cdf96200a144b741a8797126d806f995f8d07 Mon Sep 17 00:00:00 2001 From: seph Date: Wed, 25 Jul 2018 03:20:08 -0400 Subject: [PATCH 332/638] Update GKE kubelet scraping docs (#1682) --- README.md | 2 ++ docs/GKE-cadvisor-support.md | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 67cbd4c1..e58fec6b 100644 --- a/README.md +++ b/README.md @@ -362,6 +362,8 @@ Should the Prometheus `/targets` page show kubelet targets, but not able to succ As described in the [prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default. +If you are using Google's GKE product, see [docs/GKE-cadvisor-support.md](docs/GKE-cadvisor-support.md). + #### Authentication problem The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations. 
diff --git a/docs/GKE-cadvisor-support.md b/docs/GKE-cadvisor-support.md index aeb09514..f1a88825 100644 --- a/docs/GKE-cadvisor-support.md +++ b/docs/GKE-cadvisor-support.md @@ -1,7 +1,21 @@ -# Kubelet / cAdvisor special configuration updates for GKE +# Kubelet / cAdvisor special configuration updates for GKE -In order to allow Prometheus to access the endpoints provided by the kubelet/cAdvisor on GKE we have to downgrade the scheme to HTTP (from HTTPS). +Prior to GKE 1.11, the kubelet does not support token +authentication. Until it does, Prometheus must use HTTP (not HTTPS) +for scraping. +You can configure this behavior through kube-prometheus with: +``` +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet') + + { + _config+:: { + # ... config here + } + }; +``` + +Or, you can patch and re-apply your existing manifests with: On linux: @@ -10,9 +24,9 @@ sed -i -e 's/https/http/g' \ contrib/kube-prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml ``` -On MacOs: +On MacOs: -``` +``` sed -i '' -e 's/https/http/g' \ contrib/kube-prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml ``` From 6f3eac5f4cd76c91450d2c7a6f4ef60edbb87bff Mon Sep 17 00:00:00 2001 From: Haoyu Wang Date: Wed, 25 Jul 2018 21:22:59 +0800 Subject: [PATCH 333/638] prometheus: Add ability to configure apiserver config of kubernetes_sd_config (#1439) --- ...r-0prometheusCustomResourceDefinition.yaml | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index df1274eb..fad5f325 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -671,6 +671,76 @@ spec: type: array required: - alertmanagers + 
apiserverConfig: + description: 'APIServerConfig defines a host and auth methods to access + apiserver. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config' + properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over basic + authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bearerToken: + description: Bearer token for accessing apiserver. + type: string + bearerTokenFile: + description: File to read bearer token for accessing apiserver. + type: string + host: + description: Host of apiserver. A valid string consisting of a hostname + or IP followed by an optional port number + type: string + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. 
+ type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + required: + - host baseImage: description: Base image to use for a Prometheus deployment. type: string From 9a011bb8ac5aadef36b8a7355c1fa501463bc506 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 25 Jul 2018 13:50:01 +0200 Subject: [PATCH 334/638] kube-prometheus: Use locking mechanism for jsonnet dependencies --- .gitignore | 1 - Makefile | 2 +- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- jsonnetfile.json | 10 --- jsonnetfile.lock.json | 14 ++++ ...0alertmanagerCustomResourceDefinition.yaml | 10 +-- ...r-0prometheusCustomResourceDefinition.yaml | 82 ++----------------- ...ervicemonitorCustomResourceDefinition.yaml | 2 +- .../0prometheus-operator-deployment.yaml | 8 +- manifests/kube-state-metrics-clusterRole.yaml | 3 - manifests/kube-state-metrics-role.yaml | 9 -- 11 files changed, 30 insertions(+), 113 deletions(-) create mode 100644 jsonnetfile.lock.json diff --git a/.gitignore b/.gitignore index dc2549f2..ee91348f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ tmp/ minikube-manifests/ -jsonnetfile.lock.json vendor/ ./auth diff --git a/Makefile b/Makefile index 74ccee14..f40e8104 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ manifests: vendor example.jsonnet build.sh rm -rf manifests ./build.sh -vendor: $(JB_BINARY) jsonnetfile.json +vendor: $(JB_BINARY) jsonnetfile.json jsonnetfile.lock.json rm -rf vendor $(JB_BINARY) install diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 719f0e94..edbf70e5 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "master" + "version": "v0.22.2" }, { "name": "etcd-mixin", diff --git a/jsonnetfile.json b/jsonnetfile.json index 
dc29c821..b4ebb0f2 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -9,16 +9,6 @@ } }, "version": "." - }, - { - "name": "prometheus-operator", - "source": { - "git": { - "remote": "../../", - "subdir": "jsonnet/prometheus-operator" - } - }, - "version": "." } ] } \ No newline at end of file diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json new file mode 100644 index 00000000..e394e5a7 --- /dev/null +++ b/jsonnetfile.lock.json @@ -0,0 +1,14 @@ +{ + "dependencies": [ + { + "name": "kube-prometheus", + "source": { + "git": { + "remote": "../../", + "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" + } + }, + "version": "ca9cf6257548c30a6d3d6e926f38498f96cc3525" + } + ] +} \ No newline at end of file diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 9d782f51..6f30397a 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'AlertmanagerSpec is a specification of the desired behavior - of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'Specification of the desired behavior of the Alertmanager + cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: affinity: description: Affinity is a group of affinity scheduling rules. @@ -2372,9 +2372,9 @@ spec: description: Version the cluster should be on. type: string status: - description: 'AlertmanagerStatus is the most recent observed status of the - Alertmanager cluster. Read-only. 
Not included when requesting from the - apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'Most recent observed status of the Alertmanager cluster. Read-only. + Not included when requesting from the apiserver, only from the Prometheus + Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index fad5f325..140deffa 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'PrometheusSpec is a specification of the desired behavior - of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'Specification of the desired behavior of the Prometheus cluster. + More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: additionalAlertManagerConfigs: description: SecretKeySelector selects a key of a Secret. @@ -671,76 +671,6 @@ spec: type: array required: - alertmanagers - apiserverConfig: - description: 'APIServerConfig defines a host and auth methods to access - apiserver. 
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config' - properties: - basicAuth: - description: 'BasicAuth allow an endpoint to authenticate over basic - authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' - properties: - password: - description: SecretKeySelector selects a key of a Secret. - properties: - key: - description: The key of the secret to select from. Must - be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string - optional: - description: Specify whether the Secret or it's key must - be defined - type: boolean - required: - - key - username: - description: SecretKeySelector selects a key of a Secret. - properties: - key: - description: The key of the secret to select from. Must - be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string - optional: - description: Specify whether the Secret or it's key must - be defined - type: boolean - required: - - key - bearerToken: - description: Bearer token for accessing apiserver. - type: string - bearerTokenFile: - description: File to read bearer token for accessing apiserver. - type: string - host: - description: Host of apiserver. A valid string consisting of a hostname - or IP followed by an optional port number - type: string - tlsConfig: - description: TLSConfig specifies TLS configuration parameters. - properties: - caFile: - description: The CA cert to use for the targets. - type: string - certFile: - description: The client cert file for the targets. - type: string - insecureSkipVerify: - description: Disable target certificate validation. - type: boolean - keyFile: - description: The client key file for the targets. 
- type: string - serverName: - description: Used to verify the hostname for the targets. - type: string - required: - - host baseImage: description: Base image to use for a Prometheus deployment. type: string @@ -2932,7 +2862,7 @@ spec: description: Peers is a DNS name for Thanos to discover peers through. type: string s3: - description: ThanosS3Spec defines parameters for of AWS Simple Storage + description: ThanosSpec defines parameters for of AWS Simple Storage Service (S3) with Thanos. (S3 compatible services apply as well) properties: accessKey: @@ -3031,9 +2961,9 @@ spec: description: Version of Prometheus to be deployed. type: string status: - description: 'PrometheusStatus is the most recent observed status of the - Prometheus cluster. Read-only. Not included when requesting from the apiserver, - only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'Most recent observed status of the Prometheus cluster. Read-only. + Not included when requesting from the apiserver, only from the Prometheus + Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index 9d96bfeb..f3068cf8 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -169,7 +169,7 @@ spec: description: The label to use to retrieve the job name from. 
type: string namespaceSelector: - description: NamespaceSelector is a selector for selecting either all + description: A selector for selecting namespaces either selecting all namespaces or a list of namespaces. properties: any: diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index a0826980..358fb6e2 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -18,7 +18,6 @@ spec: containers: - args: - --kubelet-service=kube-system/kubelet - - -logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.2 image: quay.io/coreos/prometheus-operator:v0.22.2 @@ -29,13 +28,10 @@ spec: resources: limits: cpu: 200m - memory: 200Mi + memory: 100Mi requests: cpu: 100m - memory: 100Mi - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true + memory: 50Mi nodeSelector: beta.kubernetes.io/os: linux securityContext: diff --git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml index c519a918..cae18483 100644 --- a/manifests/kube-state-metrics-clusterRole.yaml +++ b/manifests/kube-state-metrics-clusterRole.yaml @@ -34,9 +34,6 @@ rules: - apps resources: - statefulsets - - daemonsets - - deployments - - replicasets verbs: - list - watch diff --git a/manifests/kube-state-metrics-role.yaml b/manifests/kube-state-metrics-role.yaml index e03d8898..0063ffb4 100644 --- a/manifests/kube-state-metrics-role.yaml +++ b/manifests/kube-state-metrics-role.yaml @@ -19,12 +19,3 @@ rules: verbs: - get - update -- apiGroups: - - apps - resourceNames: - - kube-state-metrics - resources: - - deployments - verbs: - - get - - update From db39e213dbc330c208c740f3315dab3a24084f43 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 25 Jul 2018 16:37:58 +0200 Subject: [PATCH 335/638] kube-prometheus: Update 
jsonnet lock --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index e394e5a7..6b42cc52 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "ca9cf6257548c30a6d3d6e926f38498f96cc3525" + "version": "2169c4435497eb9d1420e7be19b2a0aefaa0135e" } ] } \ No newline at end of file From f8d0de9835e51489989e89e299671b3d4b6ebbfc Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 25 Jul 2018 17:38:20 +0200 Subject: [PATCH 336/638] kube-prometheus: Re-generate --- manifests/kube-state-metrics-clusterRole.yaml | 3 +++ manifests/kube-state-metrics-role.yaml | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml index cae18483..c519a918 100644 --- a/manifests/kube-state-metrics-clusterRole.yaml +++ b/manifests/kube-state-metrics-clusterRole.yaml @@ -34,6 +34,9 @@ rules: - apps resources: - statefulsets + - daemonsets + - deployments + - replicasets verbs: - list - watch diff --git a/manifests/kube-state-metrics-role.yaml b/manifests/kube-state-metrics-role.yaml index 0063ffb4..e03d8898 100644 --- a/manifests/kube-state-metrics-role.yaml +++ b/manifests/kube-state-metrics-role.yaml @@ -19,3 +19,12 @@ rules: verbs: - get - update +- apiGroups: + - apps + resourceNames: + - kube-state-metrics + resources: + - deployments + verbs: + - get + - update From aedbeca70bce67637e7f15ebcc716d1bfe7d6dc5 Mon Sep 17 00:00:00 2001 From: William Leese Date: Wed, 25 Jul 2018 15:19:15 +0200 Subject: [PATCH 337/638] make generate-in-docker Change-Id: I85f679ea7a33febf5d730960bf754f4b459f53d3 --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 6b42cc52..63308503 100644 --- a/jsonnetfile.lock.json +++ 
b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "2169c4435497eb9d1420e7be19b2a0aefaa0135e" + "version": "8047c9460b1aec00cece57b7dd523b8f5f0deb58" } ] } \ No newline at end of file From 9db9ae932a250fa7844f31bd2ed9056dacf49889 Mon Sep 17 00:00:00 2001 From: William Leese Date: Thu, 26 Jul 2018 08:44:16 +0200 Subject: [PATCH 338/638] make format generate-in-docker Change-Id: I54f6f26449b0f16b7be2f202ec93e7c29a377108 --- ...ator-0prometheusCustomResourceDefinition.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 140deffa..2498fe59 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -2858,6 +2858,22 @@ spec: description: Google Cloud Storage bucket name for stored blocks. If empty it won't store any block inside Google Cloud Storage. type: string + credentials: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key peers: description: Peers is a DNS name for Thanos to discover peers through. 
type: string From aee5d45255b63c7858d511cdeba9614e45775bf5 Mon Sep 17 00:00:00 2001 From: William Leese Date: Thu, 26 Jul 2018 09:45:17 +0200 Subject: [PATCH 339/638] pull in dep changes Change-Id: I589eb7b007f9ac035936c95eaa818497607038bc --- jsonnetfile.lock.json | 2 +- ...ator-0prometheusCustomResourceDefinition.yaml | 16 ---------------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 63308503..8a0a5654 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "8047c9460b1aec00cece57b7dd523b8f5f0deb58" + "version": "2af790e886cf0f1f8f739618947d43722952d07c" } ] } \ No newline at end of file diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 2498fe59..140deffa 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -2858,22 +2858,6 @@ spec: description: Google Cloud Storage bucket name for stored blocks. If empty it won't store any block inside Google Cloud Storage. type: string - credentials: - description: SecretKeySelector selects a key of a Secret. - properties: - key: - description: The key of the secret to select from. Must - be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string - optional: - description: Specify whether the Secret or it's key must - be defined - type: boolean - required: - - key peers: description: Peers is a DNS name for Thanos to discover peers through. 
type: string From 85d6c48f9869318115ea12b30eff77f29bbe4076 Mon Sep 17 00:00:00 2001 From: prune Date: Thu, 26 Jul 2018 14:58:12 -0400 Subject: [PATCH 340/638] allow creation of role and rolebindings for other namespaces in jsonnet --- docs/monitoring-other-namespaces.md | 28 +++++++++++++++ .../prometheus/prometheus.libsonnet | 36 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 docs/monitoring-other-namespaces.md diff --git a/docs/monitoring-other-namespaces.md b/docs/monitoring-other-namespaces.md new file mode 100644 index 00000000..c1ee7ae7 --- /dev/null +++ b/docs/monitoring-other-namespaces.md @@ -0,0 +1,28 @@ +# Monitoring other Kubernetes Namespaces +This guide will help you monitor applications in other Namespaces, which is only enabled for the `Default` Namespace during Install. + +# Setup +You have to give the list of the Namespaces that you want to be able to monitor. +This is done in the variable `prometheus.roleSpecificNamespaces`. You usually set this in your `.jsonnet` file when building the manifests. 
+ +Ex to create the needed `Role` and `RoleBinding` for the Namespace `foo` : +``` +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + + prometheus+:: { + roleSpecificNamespaces: ["foo"], + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +``` \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index e84986f5..d4010d5b 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -21,6 +21,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; replicas: 2, rules: {}, renderedRules: {}, + roleSpecificNamespaces: [], }, }, @@ -65,6 +66,20 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), + roleBindingSpecificNamespace: + local roleBinding = k.rbac.v1.roleBinding; + + local newSpecificRoleBinding(namespace) = + roleBinding.new() + + 
roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + + roleBinding.mixin.metadata.withNamespace(namespace) + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: namespace }]); + + local roleBindigList = k.rbac.v1.roleBindingList; + roleBindigList.new([newSpecificRoleBinding(x) for x in $._config.prometheus.roleSpecificNamespaces]), clusterRole: local clusterRole = k.rbac.v1.clusterRole; local policyRule = clusterRole.rulesType; @@ -163,6 +178,27 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + role.mixin.metadata.withNamespace('default') + role.withRules(coreRule), + roleSpecificNamespace: + local role = k.rbac.v1.role; + local policyRule = role.rulesType; + local coreRule = policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources([ + 'nodes', + 'services', + 'endpoints', + 'pods', + ]) + + policyRule.withVerbs(['get', 'list', 'watch']); + + local newSpecificRole(namespace) = + role.new() + + role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + + role.mixin.metadata.withNamespace(namespace) + + role.withRules(coreRule); + + local roleList = k.rbac.v1.roleList; + roleList.new([newSpecificRole(x) for x in $._config.prometheus.roleSpecificNamespaces]), roleBindingKubeSystem: local roleBinding = k.rbac.v1.roleBinding; From 88f79258f50632eb5d061470e416474b3cece513 Mon Sep 17 00:00:00 2001 From: prune Date: Fri, 27 Jul 2018 07:05:42 -0400 Subject: [PATCH 341/638] replaced default namespaces rbac rules by a loop --- docs/monitoring-other-namespaces.md | 4 +- .../prometheus/prometheus.libsonnet | 86 +------------------ 2 files changed, 3 
insertions(+), 87 deletions(-) diff --git a/docs/monitoring-other-namespaces.md b/docs/monitoring-other-namespaces.md index c1ee7ae7..56c72062 100644 --- a/docs/monitoring-other-namespaces.md +++ b/docs/monitoring-other-namespaces.md @@ -1,5 +1,5 @@ # Monitoring other Kubernetes Namespaces -This guide will help you monitor applications in other Namespaces, which is only enabled for the `Default` Namespace during Install. +This guide will help you monitor applications in other Namespaces. By default the RBAC rules are only enabled for the `Default` and `kube-system` Namespace during Install. # Setup You have to give the list of the Namespaces that you want to be able to monitor. @@ -12,7 +12,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { namespace: 'monitoring', prometheus+:: { - roleSpecificNamespaces: ["foo"], + namespaces: ["default", "kube-system","foo"], }, }, }; diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index d4010d5b..375a8b70 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -21,7 +21,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; replicas: 2, rules: {}, renderedRules: {}, - roleSpecificNamespaces: [], + namespaces: ["default", "kube-system",$._config.namespace], }, }, @@ -56,16 +56,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; groups: $._config.prometheus.rules.groups, }, }, - roleBindingDefault: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.metadata.withNamespace('default') + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - 
roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), roleBindingSpecificNamespace: local roleBinding = k.rbac.v1.roleBinding; @@ -123,16 +113,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name + '-config') + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), - roleBindingNamespace: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.metadata.withNamespace($._config.namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), clusterRoleBinding: local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; @@ -142,42 +122,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; clusterRoleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), - roleKubeSystem: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'nodes', - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - - role.new() + - role.mixin.metadata.withName('prometheus-' + 
$._config.prometheus.name) + - role.mixin.metadata.withNamespace('kube-system') + - role.withRules(coreRule), - roleDefault: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'nodes', - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - - role.new() + - role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - role.mixin.metadata.withNamespace('default') + - role.withRules(coreRule), roleSpecificNamespace: local role = k.rbac.v1.role; local policyRule = role.rulesType; @@ -199,34 +143,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local roleList = k.rbac.v1.roleList; roleList.new([newSpecificRole(x) for x in $._config.prometheus.roleSpecificNamespaces]), - roleBindingKubeSystem: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.metadata.withNamespace('kube-system') + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), - roleNamespace: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'nodes', - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - - role.new() + - role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - role.mixin.metadata.withNamespace($._config.namespace) + - role.withRules(coreRule), prometheus: local container = 
k.core.v1.pod.mixin.spec.containersType; local resourceRequirements = container.mixin.resourcesType; From 20ec197cd443556cbbae5556668d15dff95fbb26 Mon Sep 17 00:00:00 2001 From: prune Date: Fri, 27 Jul 2018 07:47:03 -0400 Subject: [PATCH 342/638] set right variable name --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 375a8b70..330a022e 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -69,7 +69,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: namespace }]); local roleBindigList = k.rbac.v1.roleBindingList; - roleBindigList.new([newSpecificRoleBinding(x) for x in $._config.prometheus.roleSpecificNamespaces]), + roleBindigList.new([newSpecificRoleBinding(x) for x in $._config.prometheus.namespaces]), clusterRole: local clusterRole = k.rbac.v1.clusterRole; local policyRule = clusterRole.rulesType; @@ -142,7 +142,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; role.withRules(coreRule); local roleList = k.rbac.v1.roleList; - roleList.new([newSpecificRole(x) for x in $._config.prometheus.roleSpecificNamespaces]), + roleList.new([newSpecificRole(x) for x in $._config.prometheus.namespaces]), prometheus: local container = k.core.v1.pod.mixin.spec.containersType; local resourceRequirements = container.mixin.resourcesType; From fc382e15cd01788d9d5c53c51369aad55750df14 Mon Sep 17 00:00:00 2001 From: prune Date: Fri, 27 Jul 2018 07:48:17 -0400 Subject: [PATCH 343/638] allow creation of role and rolebindings for other namespaces in jsonnet replaced default namespaces rbac rules by a loop set right variable name --- docs/monitoring-other-namespaces.md | 28 ++++++ 
.../prometheus/prometheus.libsonnet | 94 +++++-------------- 2 files changed, 51 insertions(+), 71 deletions(-) create mode 100644 docs/monitoring-other-namespaces.md diff --git a/docs/monitoring-other-namespaces.md b/docs/monitoring-other-namespaces.md new file mode 100644 index 00000000..56c72062 --- /dev/null +++ b/docs/monitoring-other-namespaces.md @@ -0,0 +1,28 @@ +# Monitoring other Kubernetes Namespaces +This guide will help you monitor applications in other Namespaces. By default the RBAC rules are only enabled for the `Default` and `kube-system` Namespace during Install. + +# Setup +You have to give the list of the Namespaces that you want to be able to monitor. +This is done in the variable `prometheus.roleSpecificNamespaces`. You usually set this in your `.jsonnet` file when building the manifests. + +Ex to create the needed `Role` and `Rolebindig` for the Namespace `foo` : +``` +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + + prometheus+:: { + namespaces: ["default", "kube-system","foo"], + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +``` \ No newline at end of file diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 
e84986f5..330a022e 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -21,6 +21,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; replicas: 2, rules: {}, renderedRules: {}, + namespaces: ["default", "kube-system",$._config.namespace], }, }, @@ -55,16 +56,20 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; groups: $._config.prometheus.rules.groups, }, }, - roleBindingDefault: + roleBindingSpecificNamespace: local roleBinding = k.rbac.v1.roleBinding; - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.metadata.withNamespace('default') + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), + local newSpecificRoleBinding(namespace) = + roleBinding.new() + + roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + + roleBinding.mixin.metadata.withNamespace(namespace) + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: namespace }]); + + local roleBindigList = k.rbac.v1.roleBindingList; + roleBindigList.new([newSpecificRoleBinding(x) for x in $._config.prometheus.namespaces]), clusterRole: local clusterRole = k.rbac.v1.clusterRole; local policyRule = clusterRole.rulesType; @@ -108,16 +113,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; roleBinding.mixin.roleRef.withName('prometheus-' + 
$._config.prometheus.name + '-config') + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), - roleBindingNamespace: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.metadata.withNamespace($._config.namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), clusterRoleBinding: local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; @@ -127,10 +122,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; clusterRoleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), - roleKubeSystem: + roleSpecificNamespace: local role = k.rbac.v1.role; local policyRule = role.rulesType; - local coreRule = policyRule.new() + policyRule.withApiGroups(['']) + policyRule.withResources([ @@ -140,57 +134,15 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; 'pods', ]) + policyRule.withVerbs(['get', 'list', 'watch']); - - role.new() + - role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - role.mixin.metadata.withNamespace('kube-system') + - role.withRules(coreRule), - roleDefault: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 
'nodes', - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - - role.new() + - role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - role.mixin.metadata.withNamespace('default') + - role.withRules(coreRule), - roleBindingKubeSystem: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.metadata.withNamespace('kube-system') + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), - roleNamespace: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'nodes', - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - - role.new() + - role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + - role.mixin.metadata.withNamespace($._config.namespace) + - role.withRules(coreRule), + + local newSpecificRole(namespace) = + role.new() + + role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + + role.mixin.metadata.withNamespace(namespace) + + role.withRules(coreRule); + + local roleList = k.rbac.v1.roleList; + roleList.new([newSpecificRole(x) for x in $._config.prometheus.namespaces]), prometheus: local container = k.core.v1.pod.mixin.spec.containersType; local resourceRequirements = container.mixin.resourcesType; From 1b92824fbddb8a4972080e8f2afa35db760cb998 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Tue, 31 Jul 2018 12:56:40 -0500 Subject: [PATCH 344/638] kube-prometheus: Update table of contents --- 
README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e58fec6b..0db71c22 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,13 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [Configuration](#configuration) * [Customization](#customization) * [Alertmanager configuration](#alertmanager-configuration) + * [Static etcd configuration](#static-etcd-configuration) * [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) * [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) * [Minikube Example](#minikube-example) * [Troubleshooting](#troubleshooting) * [Error retrieving kubelet metrics](#error-retrieving-kubelet-metrics) + * [kube-state-metrics resource usage](#kube-state-metrics-resource-usage) ## Prerequisites From 7d5c81c230b9430336940d0bca69f3e593bcfb21 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Tue, 31 Jul 2018 13:02:40 -0500 Subject: [PATCH 345/638] kube-prometheus: fixing/adding some links --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0db71c22..e33c237f 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ $ jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonn > `jb` can be installed with `go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb` -You may wish to not use ksonnet and simply render the generated manifests to files on disk, this can be done with: +You may wish to not use ksonnet and simply render the generated manifests to files on disk, this can be done by running `build.sh` with [example.jsonnet](example.jsonnet): [embedmd]:# (example.jsonnet) ```jsonnet @@ -99,7 +99,7 @@ This renders all manifests in a json structure of `{filename: manifest-content}` ### Compiling -To compile the above and get each manifest in a separate 
file on disk use the following script: +To compile the above and get each manifest in a separate file on disk use the following [build.sh](build.sh) script (i.e. `./build.sh example.jsonnet`): [embedmd]:# (build.sh) ```sh @@ -183,7 +183,7 @@ The grafana definition is located in a different project (https://github.com/bra Jsonnet is a turing complete language, any logic can be reflected in it. It also has powerful merge functionalities, allowing sophisticated customizations of any kind simply by merging it into the object the library provides. -A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For [kubeadm]() and [bootkube]() clusters there are mixins available to easily configure these: +A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For [kubeadm](examples/jsonnet-snippets/kubeadm.jsonnet) and [bootkube](examples/jsonnet-snippets/bootkube.jsonnet) and [kops](examples/jsonnet-snippets/kops.jsonnet) clusters there are mixins available to easily configure these: kubeadm: @@ -217,7 +217,7 @@ Another mixin that may be useful for exploring the stack is to expose the UIs of (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') ``` -For example the name of the `Prometheus` object provided by this library can be overridden: +To give another customization example, the name of the `Prometheus` object provided by this library can be overridden: [embedmd]:# (examples/prometheus-name-override.jsonnet) ```jsonnet @@ -331,7 +331,7 @@ See [exposing Prometheus/Alertmanager/Grafana](docs/exposing-prometheus-alertman ## Minikube Example -To use an easy to reproduce example, let's take the minikube setup as demonstrated in [prerequisites](#Prerequisites). 
It is a kubeadm cluster (as we use the kubeadm bootstrapper) and because we would like easy access to our Prometheus, Alertmanager and Grafana UI we want the services to be exposed as NodePort type services: +To use an easy to reproduce example, let's take the minikube setup as demonstrated in [Prerequisites](#prerequisites). It is a kubeadm cluster (as we use the kubeadm bootstrapper) and because we would like easy access to our Prometheus, Alertmanager and Grafana UI we want the services to be exposed as NodePort type services: > Note that NodePort type services is likely not a good idea for your production use case, it is only used for demonstration purposes here. @@ -362,7 +362,7 @@ local kp = Should the Prometheus `/targets` page show kubelet targets, but not able to successfully scrape the metrics, then most likely it is a problem with the authentication and authorization setup of the kubelets. -As described in the [prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default. +As described in the [Prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default. If you are using Google's GKE product, see [docs/GKE-cadvisor-support.md]. 
From 64f3e5def945672a63efc338ff7a4cbf74777b2f Mon Sep 17 00:00:00 2001 From: prune Date: Tue, 31 Jul 2018 14:49:13 -0400 Subject: [PATCH 346/638] added newline in doc file --- docs/monitoring-other-namespaces.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/monitoring-other-namespaces.md b/docs/monitoring-other-namespaces.md index 56c72062..2e5289c4 100644 --- a/docs/monitoring-other-namespaces.md +++ b/docs/monitoring-other-namespaces.md @@ -25,4 +25,4 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } -``` \ No newline at end of file +``` From d58ccab5d0969162fd66c1c32560091dd119d0bc Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Tue, 31 Jul 2018 13:59:50 -0500 Subject: [PATCH 347/638] kube-prometheus: commands to create and teardown the stack Resolves issues #1571 and #1609 in this repository. --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e33c237f..9590a654 100644 --- a/README.md +++ b/README.md @@ -54,12 +54,17 @@ $ minikube delete && minikube start --kubernetes-version=v1.10.1 --memory=4096 - ## Quickstart -Although this project is intended to be used as a library, a compiled version of the Kubernetes manifests generated with this library is checked into this repository in order to try the content out quickly. - -Simply create the stack: +This project is intended to be used as a library (i.e. the intent is not for you to create your own customized copy of this repository). +Though a compiled version of the Kubernetes manifests generated with this library is checked into this repository in order to try the content out quickly. 
+ * Simply create the stack: ``` -$ kubectl create -f manifests/ +$ kubectl create -f manifests/ || true +$ kubectl create -f manifests/ 2>/dev/null || true # This command sometimes may need to be done twice +``` + * And to teardown the stack: +``` +$ kubectl delete -f manifests/ || true ``` ## Usage From 532f546dcfe736dbf01c44d2b84a46197e7adda8 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Tue, 31 Jul 2018 14:36:55 -0500 Subject: [PATCH 348/638] kube-prometheus: add info about installing & compiling --- README.md | 57 ++++++++++++++++++++++++++++++++++++++++++++----------- build.sh | 4 +++- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 9590a654..d42697a7 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,9 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [minikube](#minikube) * [Quickstart](#quickstart) * [Usage](#usage) + * [Installing](#installing) * [Compiling](#compiling) + * [Containerized Installing and Compiling](#containerized-installing-and-compiling) * [Configuration](#configuration) * [Customization](#customization) * [Alertmanager configuration](#alertmanager-configuration) @@ -69,19 +71,27 @@ $ kubectl delete -f manifests/ || true ## Usage +### Installing + The content of this project consists of a set of [jsonnet](http://jsonnet.org/) files making up a library to be consumed. 
Install this library in your own project with [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler#install): - ``` $ mkdir my-kube-prometheus; cd my-kube-prometheus -$ jb init -$ jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet/kube-prometheus +$ jb init # Creates the initial/empty `jsonnetfile.json` +$ jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet/kube-prometheus # Creates `vendor/` & `jsonnetfile.lock.json`, and fills in `jsonnetfile.json` ``` > `jb` can be installed with `go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb` -You may wish to not use ksonnet and simply render the generated manifests to files on disk, this can be done by running `build.sh` with [example.jsonnet](example.jsonnet): +> An e.g. of how to install a given version of this library: `jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet/kube-prometheus/@v0.22.0` + +### Compiling + +You may wish to not use ksonnet/jsonnet and simply render the generated manifests to files on disk (in which case you still don't need a copy of this entire repository, but rather only a copy of a few select files). +This can be done e.g. by running `./build.sh example.jsonnet`. + +Here's [example.jsonnet](example.jsonnet): [embedmd]:# (example.jsonnet) ```jsonnet @@ -100,15 +110,14 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` -This renders all manifests in a json structure of `{filename: manifest-content}`. - -### Compiling - -To compile the above and get each manifest in a separate file on disk use the following [build.sh](build.sh) script (i.e. 
`./build.sh example.jsonnet`): +And here's the [build.sh](build.sh) script (this renders all manifests in a json structure of `{filename: manifest-content}`): [embedmd]:# (build.sh) ```sh #!/usr/bin/env bash + +# This script uses arg $1 (name of *.jsonnet file to use) to generate the manifests/*.yaml files. + set -e set -x # only exit with zero if all commands of the pipeline exit successfully @@ -123,11 +132,37 @@ jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} ``` -> Note you need `jsonnet` and `gojsonyaml` (`go get github.com/brancz/gojsontoyaml`) installed. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. +> Note you need `jsonnet` and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. This script reads each key of the generated json and uses that as the file name, and writes the value of that key to that file. -> You can also run this script executing the command `make generate-raw` from kube-prometheus base directory of this repository but the above option it is recommended so that you run it in your own infrastructure repository. +### Containerized Installing and Compiling + +If you don't care to have `jb` or `jsonnet` or `gojsontoyaml` installed, then build the `po-jsonnet` Docker image (this is something you'll need a copy of this repository for). 
Do the following from this `kube-prometheus` directory: +``` +$ make ../../hack/jsonnet-docker-image +``` + +Then you can do commands such as the following: +``` +docker run \ + --rm \ + -v `pwd`:`pwd` \ + --workdir `pwd` \ + po-jsonnet jb init + +docker run \ + --rm \ + -v `pwd`:`pwd` \ + --workdir `pwd` \ + po-jsonnet jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet/kube-prometheus + +docker run \ + --rm \ + -v `pwd`:`pwd` \ + --workdir `pwd` \ + po-jsonnet ./build.sh example.jsonnet +``` ## Configuration diff --git a/build.sh b/build.sh index 4eaf5ac5..8e9d4342 100755 --- a/build.sh +++ b/build.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash + +# This script uses arg $1 (name of *.jsonnet file to use) to generate the manifests/*.yaml files. + set -e set -x # only exit with zero if all commands of the pipeline exit successfully @@ -10,4 +13,3 @@ mkdir manifests # optional, but we would like to generate yaml, not json jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml; rm -f {}' -- {} - From cfa8ab7911022efd6858b55c8b897ffb192df431 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Tue, 31 Jul 2018 14:44:24 -0500 Subject: [PATCH 349/638] kube-prometheus: add more section headings under Customization --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index d42697a7..6dac4bd3 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,10 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [Containerized Installing and Compiling](#containerized-installing-and-compiling) * [Configuration](#configuration) * [Customization](#customization) + * [Cluster Creation Tools](cluster-creation-tools) + * [NodePorts](nodeports) + * [Prometheus Object Name](prometheus-object-name) + * [node-exporter DaemonSet namespace](node-exporter-daemonset-namespace) * [Alertmanager configuration](#alertmanager-configuration) * [Static etcd 
configuration](#static-etcd-configuration) * [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) @@ -223,6 +227,8 @@ The grafana definition is located in a different project (https://github.com/bra Jsonnet is a turing complete language, any logic can be reflected in it. It also has powerful merge functionalities, allowing sophisticated customizations of any kind simply by merging it into the object the library provides. +### Cluster Creation Tools + A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For [kubeadm](examples/jsonnet-snippets/kubeadm.jsonnet) and [bootkube](examples/jsonnet-snippets/bootkube.jsonnet) and [kops](examples/jsonnet-snippets/kops.jsonnet) clusters there are mixins available to easily configure these: kubeadm: @@ -249,6 +255,8 @@ kops: (import 'kube-prometheus/kube-prometheus-kops.libsonnet') ``` +### NodePorts + Another mixin that may be useful for exploring the stack is to expose the UIs of Prometheus, Alertmanager and Grafana on NodePorts: [embedmd]:# (examples/jsonnet-snippets/node-ports.jsonnet) @@ -257,6 +265,8 @@ Another mixin that may be useful for exploring the stack is to expose the UIs of (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') ``` +### Prometheus Object Name + To give another customization example, the name of the `Prometheus` object provided by this library can be overridden: [embedmd]:# (examples/prometheus-name-override.jsonnet) @@ -272,6 +282,8 @@ To give another customization example, the name of the `Prometheus` object provi }).prometheus.prometheus ``` +### node-exporter DaemonSet namespace + Standard Kubernetes manifests are all written using [ksonnet-lib](https://github.com/ksonnet/ksonnet-lib/), so they can be modified with the mixins supplied by ksonnet-lib. 
For example to override the namespace of the node-exporter DaemonSet: [embedmd]:# (examples/ksonnet-example.jsonnet) From d0020e4863f764cf21150c4bb3f4a12c52847e4c Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 31 Jul 2018 17:26:26 +0200 Subject: [PATCH 350/638] prometheus: Enqueue both P in NS as well as P selecting obj in NS By default if a new object (servicemonitor, secret, ...) is created / updated / deleted, we reconcile all Prometheus instances in that namespace. In addition we also need to reconcile all Prometheus instances selecting objects (ServiceMonitors, PrometheusRules) in that namespace. --- manifests/grafana-dashboardDefinitions.yaml | 40 ++++++++++----------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 1143970e..e3fa61c7 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -4792,7 +4792,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "100 - (avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[5m])) * 100)\n", + "expr": "1 - (avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[5m])))\n", "format": "time_series", "intervalFactor": 10, "legendFormat": "{{cpu}}", @@ -4822,18 +4822,18 @@ items: }, "yaxes": [ { - "format": "percent", + "format": "percentunit", "label": null, "logBase": 1, - "max": 100, + "max": 1, "min": 0, "show": true }, { - "format": "percent", + "format": "percentunit", "label": null, "logBase": 1, - "max": 100, + "max": 1, "min": 0, "show": true } @@ -4883,21 +4883,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"} * 100", + "expr": "max(node_load1{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 1m", "refId": "A" }, { - "expr": 
"node_load5{job=\"node-exporter\", instance=\"$instance\"} * 100", + "expr": "max(node_load5{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 5m", "refId": "B" }, { - "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"} * 100", + "expr": "max(node_load15{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 15m", @@ -4927,7 +4927,7 @@ items: }, "yaxes": [ { - "format": "percent", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -4935,7 +4935,7 @@ items: "show": true }, { - "format": "percent", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -5002,28 +5002,28 @@ items: "steppedLine": false, "targets": [ { - "expr": "node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n", + "expr": "max(\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory used", "refId": "A" }, { - "expr": "node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "max(node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory buffers", "refId": "B" }, { - "expr": "node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "max(node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory 
cached", "refId": "C" }, { - "expr": "node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "max(node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory free", @@ -5131,7 +5131,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "(\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n- node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n) * 100\n /\nnode_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n", + "expr": "max(\n (\n (\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -5215,21 +5215,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by (instance) (rate(node_disk_bytes_read{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_bytes_read{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "read", "refId": "A" }, { - "expr": "sum by (instance) (rate(node_disk_bytes_written{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_bytes_written{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "written", "refId": "B" }, { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": 
"max(rate(node_disk_io_time_ms{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "io time", @@ -5414,7 +5414,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m])", + "expr": "max(rate(node_network_receive_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5505,7 +5505,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m])", + "expr": "max(rate(node_network_transmit_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", From 15cc3e059d6dda9e64cdfe54c0845261027e3ed0 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Wed, 1 Aug 2018 10:37:31 +0200 Subject: [PATCH 351/638] */README.md: Include 'Contributing' section --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index e58fec6b..b653dc29 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [Minikube Example](#minikube-example) * [Troubleshooting](#troubleshooting) * [Error retrieving kubelet metrics](#error-retrieving-kubelet-metrics) +* [Contributing](#contributing) ## Prerequisites @@ -391,3 +392,15 @@ config. They default to: memoryPerNode: '30Mi', } ``` + +## Contributing + +All `.yaml` files in the `/manifests` folder are generated via +[Jsonnet](https://jsonnet.org/). Contributing changes will most likely include +the following process: + +1. Make your changes in the respective `*.jsonnet` file. +2. 
Commit your changes (This is currently necessary due to our vendoring + process. This is likely to change in the future). +3. Generate dependent `*.yaml` files: `make generate-in-docker`. +4. Commit the generated changes. From 78790408959155ec7245b2affb4ec57edd6e58ef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 1 Aug 2018 11:40:00 +0200 Subject: [PATCH 352/638] kube-prometheus: Fix jsonnet lock file --- jsonnetfile.lock.json | 72 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 8a0a5654..64e89c9b 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,77 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "2af790e886cf0f1f8f739618947d43722952d07c" + "version": "d7afb094898de5817f3cb49807dee44513d6d121" + }, + { + "name": "ksonnet", + "source": { + "git": { + "remote": "https://github.com/ksonnet/ksonnet-lib", + "subdir": "" + } + }, + "version": "83f20ee933bcd13fcf4ad1b49a40c92135c5569c" + }, + { + "name": "kubernetes-mixin", + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "subdir": "" + } + }, + "version": "2644ba2e0002715d0a079b5e1214128f72cbcc3b" + }, + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "0fdef020a6360415d2c8fdc82b29122583e4df05" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/kausalco/public", + "subdir": "grafana-builder" + } + }, + "version": "42f72ac60fad1022d26f1a88062689e94219d582" + }, + { + "name": "grafana", + "source": { + "git": { + "remote": "https://github.com/brancz/kubernetes-grafana", + "subdir": "grafana" + } + }, + "version": "8f131a315dfe877819c3a490eedfbcfe183b95cf" + }, + { + "name": "prometheus-operator", + "source": { + "git": { + "remote": 
"https://github.com/coreos/prometheus-operator", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "87000af76132515a1a2721e836c282d97f82593b" + }, + { + "name": "etcd-mixin", + "source": { + "git": { + "remote": "https://github.com/coreos/etcd", + "subdir": "Documentation/etcd-mixin" + } + }, + "version": "93be31d43a2728d1750120aeae7a483698ccead2" } ] } \ No newline at end of file From facf7cc0e1dd4cc52a7dbba1879226c453009b39 Mon Sep 17 00:00:00 2001 From: prune Date: Wed, 1 Aug 2018 07:35:28 -0400 Subject: [PATCH 353/638] minor update as requested in PR --- docs/monitoring-other-namespaces.md | 2 +- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/monitoring-other-namespaces.md b/docs/monitoring-other-namespaces.md index 2e5289c4..8327ed02 100644 --- a/docs/monitoring-other-namespaces.md +++ b/docs/monitoring-other-namespaces.md @@ -5,7 +5,7 @@ This guide will help you monitor applications in other Namespaces. By default th You have to give the list of the Namespaces that you want to be able to monitor. This is done in the variable `prometheus.roleSpecificNamespaces`. You usually set this in your `.jsonnet` file when building the manifests. 
-Ex to create the needed `Role` and `Rolebindig` for the Namespace `foo` : +Example to create the needed `Role` and `RoleBinding` for the Namespace `foo` : ``` local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { _config+:: { diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 330a022e..026acad0 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -56,7 +56,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; groups: $._config.prometheus.rules.groups, }, }, - roleBindingSpecificNamespace: + roleBindingSpecificNamespaces: local roleBinding = k.rbac.v1.roleBinding; local newSpecificRoleBinding(namespace) = @@ -122,7 +122,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; clusterRoleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]), - roleSpecificNamespace: + roleSpecificNamespaces: local role = k.rbac.v1.role; local policyRule = role.rulesType; local coreRule = policyRule.new() + From b3ab55560369c503834604090388ace7c65c0c80 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Wed, 1 Aug 2018 07:58:40 -0500 Subject: [PATCH 354/638] kube-prometheus: expand the minikube.jsonnet example --- README.md | 23 +--------------------- examples/minikube.jsonnet | 41 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 6dac4bd3..53c230f0 100644 --- a/README.md +++ b/README.md @@ -383,31 +383,10 @@ See [exposing Prometheus/Alertmanager/Grafana](docs/exposing-prometheus-alertman ## Minikube Example -To use an easy to reproduce example, let's take the minikube setup as 
demonstrated in [Prerequisites](#prerequisites). It is a kubeadm cluster (as we use the kubeadm bootstrapper) and because we would like easy access to our Prometheus, Alertmanager and Grafana UI we want the services to be exposed as NodePort type services: +To use an easy to reproduce example, see [minikube.jsonnet](examples/minikube.jsonnet), which uses the minikube setup as demonstrated in [Prerequisites](#prerequisites). Because we would like easy access to our Prometheus, Alertmanager and Grafana UIs, `minikube.jsonnet` exposes the services as NodePort type services. > Note that NodePort type services is likely not a good idea for your production use case, it is only used for demonstration purposes here. -[embedmd]:# (examples/minikube.jsonnet) -```jsonnet -local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + - (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - { - _config+:: { - namespace: 'monitoring', - }, - }; - -{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + -{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + -{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + -{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } -``` - ## Troubleshooting ### Error retrieving kubelet metrics diff --git a/examples/minikube.jsonnet b/examples/minikube.jsonnet index ed1a05c4..9eda7c4f 100644 --- a/examples/minikube.jsonnet +++ b/examples/minikube.jsonnet @@ -5,6 +5,47 @@ 
local kp = { _config+:: { namespace: 'monitoring', + alertmanager+:: { + config: importstr 'alertmanager-config.yaml', + }, + grafana+:: { + config: { + sections: { + // Do not require grafana users to login/authenticate + "auth.anonymous": {enabled: true}, + }, + }, + }, + }, + + // For simplicity, each of the following values for 'externalUrl': + // * assume that `minikube ip` prints "192.168.99.100" + // * hard-code the NodePort for each app + prometheus+:: { + prometheus+: { + // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec + spec+: { + // An e.g. of the purpose of this is so the "Source" links on http:///#/alerts are valid. + externalUrl: "http://192.168.99.100:30900", + + // Reference info: "external_labels" on https://prometheus.io/docs/prometheus/latest/configuration/configuration/ + externalLabels: { + // This 'cluster' label will be included on every firing prometheus alert. (This is more useful + // when running multiple clusters in a shared environment (e.g. AWS) with other users.) 
+ cluster: "minikube-", + }, + }, + }, + }, + alertmanager+:: { + alertmanager+: { + // Reference info: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#alertmanagerspec + spec+: { + externalUrl: "http://192.168.99.100:30903", + + logLevel: "debug", // So firing alerts show up in log + }, + }, }, }; From fe923a723918861eb72d5d925c3a89f841d6f201 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Wed, 1 Aug 2018 08:03:17 -0500 Subject: [PATCH 355/638] kube-prometheus: fix some links --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 53c230f0..b299d2d6 100644 --- a/README.md +++ b/README.md @@ -28,10 +28,10 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [Containerized Installing and Compiling](#containerized-installing-and-compiling) * [Configuration](#configuration) * [Customization](#customization) - * [Cluster Creation Tools](cluster-creation-tools) - * [NodePorts](nodeports) - * [Prometheus Object Name](prometheus-object-name) - * [node-exporter DaemonSet namespace](node-exporter-daemonset-namespace) + * [Cluster Creation Tools](#cluster-creation-tools) + * [NodePorts](#nodeports) + * [Prometheus Object Name](#prometheus-object-name) + * [node-exporter DaemonSet namespace](#node-exporter-daemonset-namespace) * [Alertmanager configuration](#alertmanager-configuration) * [Static etcd configuration](#static-etcd-configuration) * [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) From bed6e4865aaec6569663b2f798347ecb82b49911 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Wed, 1 Aug 2018 09:17:58 -0500 Subject: [PATCH 356/638] kube-prometheus: update etcd info Resolves issue #1629 in this repository. 
--- README.md | 32 +---- docs/monitoring-external-etcd.md | 126 ++---------------- examples/etcd.jsonnet | 23 +++- examples/minikube.jsonnet | 1 + .../kube-prometheus-static-etcd.libsonnet | 4 +- 5 files changed, 36 insertions(+), 150 deletions(-) diff --git a/README.md b/README.md index b299d2d6..66c8c38b 100644 --- a/README.md +++ b/README.md @@ -343,35 +343,9 @@ In the above example the configuration has been inlined, but can just as well be ``` ### Static etcd configuration -In order to configure a static etcd cluster to scrape there is a simple mixin prepared, so only the IPs and certificate information need to be configured. Simply append the `kube-prometheus/kube-prometheus-static-etcd.libsonnet` mixin to the rest of the configuration, and configure the `ips` to be the IPs to scrape, and the `clientCA`, `clientKey` and `clientCert` to values that are valid to scrape etcd metrics with. +In order to configure a static etcd cluster to scrape there is a simple [kube-prometheus-static-etcd.libsonnet](jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet) mixin prepared - see [etcd.jsonnet](examples/etcd.jsonnet) for an example of how to use that mixin, and [Monitoring external etcd](docs/monitoring-external-etcd.md) for more information. -Most likely these certificates are generated somewhere in an infrastructure repository, so using the jsonnet `importstr` function can be useful here. All the sensitive information on the certificates will end up in a Kubernetes Secret. 
- -[embedmd]:# (examples/etcd.jsonnet) -```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + { - _config+:: { - namespace: 'monitoring', - - etcd+:: { - ips: ['127.0.0.1'], - clientCA: importstr 'etcd-client-ca.crt', - clientKey: importstr 'etcd-client.key', - clientCert: importstr 'etcd-client.crt', - serverName: 'etcd.my-cluster.local', - }, - }, -}; - -{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + -{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + -{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + -{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } -``` +> Note that monitoring etcd in minikube is currently not possible because of how etcd is setup. (minikube's etcd binds to 127.0.0.1:2379 only, and within host networking namespace.) ### Customizing Prometheus alerting/recording rules and Grafana dashboards @@ -385,8 +359,6 @@ See [exposing Prometheus/Alertmanager/Grafana](docs/exposing-prometheus-alertman To use an easy to reproduce example, see [minikube.jsonnet](examples/minikube.jsonnet), which uses the minikube setup as demonstrated in [Prerequisites](#prerequisites). Because we would like easy access to our Prometheus, Alertmanager and Grafana UIs, `minikube.jsonnet` exposes the services as NodePort type services. -> Note that NodePort type services is likely not a good idea for your production use case, it is only used for demonstration purposes here. 
- ## Troubleshooting ### Error retrieving kubelet metrics diff --git a/docs/monitoring-external-etcd.md b/docs/monitoring-external-etcd.md index bfdc58a3..1e26af0f 100644 --- a/docs/monitoring-external-etcd.md +++ b/docs/monitoring-external-etcd.md @@ -2,119 +2,11 @@ This guide will help you monitor an external etcd cluster. When the etcd cluster is not hosted inside Kubernetes. This is often the case with Kubernetes setups. This approach has been tested with kube-aws but the same principals apply to other tools. -# Step 1 - Make the etcd certificates available to Prometheus pod -Prometheus Operator (and Prometheus) allow us to specify a tlsConfig. This is required as most likely your etcd metrics end points is secure. +Note that [etcd.jsonnet](../examples/etcd.jsonnet) & [kube-prometheus-static-etcd.libsonnet](../jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet) (which are described by a section of the [Readme](../README.md#static-etcd-configuration)) do the following: + * Put the three etcd TLS client files (CA & cert & key) into a secret in the namespace, and have Prometheus Operator load the secret. + * Create the following (to expose etcd metrics - port 2379): a Service, Endpoint, & ServiceMonitor. -## a - Create the secrets in the namespace -Prometheus Operator allows us to mount secrets in the pod. By loading the secrets as files, they can be made available inside the Prometheus pod. - -`kubectl -n monitoring create secret generic etcd-certs --from-file=CREDENTIAL_PATH/etcd-client.pem --from-file=CREDENTIAL_PATH/etcd-client-key.pem --from-file=CREDENTIAL_PATH/ca.pem` - -where CREDENTIAL_PATH is the path to your etcd client credentials on your work machine. -(Kube-aws stores them inside the credential folder). - -## b - Get Prometheus Operator to load the secret -In the previous step we have named the secret 'etcd-certs'. 
- -Edit prometheus-operator/contrib/kube-prometheus/manifests/prometheus/prometheus-k8s.yaml and add the secret under the spec of the Prometheus object manifest: - -``` - secrets: - - etcd-certs -``` - -The manifest will look like that: -``` -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus -metadata: - name: k8s - labels: - prometheus: k8s -spec: - replicas: 2 - secrets: - - etcd-certs - version: v1.7.1 -``` - -If your Prometheus Operator is already in place, update it: - -`kubectl -n monitoring replace -f contrib/kube-prometheus/manifests/prometheus/prometheus-k8s.yaml - -# Step 2 - Create the Service, endpoints and ServiceMonitor - -The below manifest creates a Service to expose etcd metrics (port 2379) - -* Replace `IP_OF_YOUR_ETCD_NODE_[0/1/2]` with the IP addresses of your etcd nodes. If you have more than one node, add them to the same list. -* Use `#insecureSkipVerify: true` or replace `ETCD_DNS_OR_ALTERNAME_NAME` with a valid name for the certificate. - -In case you have generated the etcd certificated with kube-aws, you will need to use insecureSkipVerify as the valid certificate domain will be different for each etcd node (etcd0, etcd1, etcd2). If you only have one etcd node, you can use the value from `etcd.internalDomainName` speficied in your kube-aws `cluster.yaml` - -In this example we use insecureSkipVerify: true as kube-aws default certificates are not valid against the IP. They were created for the DNS. Depending on your use case, you might want to remove this flag or set it to false. 
(true required for kube-aws if using default certificate generators method) - -``` -apiVersion: v1 -kind: Service -metadata: - name: etcd-k8s - labels: - k8s-app: etcd -spec: - type: ClusterIP - clusterIP: None - ports: - - name: api - port: 2379 - protocol: TCP ---- -apiVersion: v1 -kind: Endpoints -metadata: - name: etcd-k8s - labels: - k8s-app: etcd -subsets: -- addresses: - - ip: IP_OF_YOUR_ETCD_NODE_0 - nodeName: etcd0 - - ip: IP_OF_YOUR_ETCD_NODE_1 - nodeName: etcd1 - - ip: IP_OF_YOUR_ETCD_NODE_2 - nodeName: etcd2 - ports: - - name: api - port: 2379 - protocol: TCP ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: etcd-k8s - labels: - k8s-app: etcd-k8s -spec: - jobLabel: k8s-app - endpoints: - - port: api - interval: 30s - scheme: https - tlsConfig: - caFile: /etc/prometheus/secrets/etcd-certs/ca.pem - certFile: /etc/prometheus/secrets/etcd-certs/etcd-client.pem - keyFile: /etc/prometheus/secrets/etcd-certs/etcd-client-key.pem - #use insecureSkipVerify only if you cannot use a Subject Alternative Name - #insecureSkipVerify: true - serverName: ETCD_DNS_OR_ALTERNAME_NAME - selector: - matchLabels: - k8s-app: etcd - namespaceSelector: - matchNames: - - monitoring -``` - -# Step 3: Open the port +# Step 1: Open the port You now need to allow the nodes Prometheus are running on to talk to the etcd on the port 2379 (if 2379 is the port used by etcd to expose the metrics) @@ -128,11 +20,11 @@ With kube-aws, each etcd node has two IP addresses: For some reason, some etcd node answer to :2379/metrics on the intance IP (eth0), some others on the EIP|ENI address (eth1). See issue https://github.com/kubernetes-incubator/kube-aws/issues/923 It would be of course much better if we could hit the EPI/ENI all the time as they don't change even if the underlying EC2 intance goes down. -If specifying the Instance IP (eth0) in the Prometheus Operator ServiceMonitor, and the EC2 intance goes down, one would have to update the ServiceMonitor. 
+If specifying the Instance IP (eth0) in the Prometheus Operator ServiceMonitor, and the EC2 instance goes down, one would have to update the ServiceMonitor. Another idea woud be to use the DNS entries of etcd, but those are not currently supported for EndPoints objects in Kubernetes. -# Step 4: verify +# Step 2: verify Go to the Prometheus UI on :9090/config and check that you have an etcd job entry: ``` @@ -142,9 +34,11 @@ Go to the Prometheus UI on :9090/config and check that you have an etcd job entr ... ``` -On the :9090/targets page, you should see "etcd" with the UP state. If not, check the Error column for more information. +On the :9090/targets page: + * You should see "etcd" with the UP state. If not, check the Error column for more information. + * If no "etcd" targets are even shown on this page, prometheus isn't attempting to scrape it. -# Step 5: Grafana dashboard +# Step 3: Grafana dashboard ## Find a dashboard you like diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet index c521d1cd..fcce57be 100644 --- a/examples/etcd.jsonnet +++ b/examples/etcd.jsonnet @@ -3,12 +3,29 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + _config+:: { namespace: 'monitoring', + // Reference info: https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/README.md#static-etcd-configuration etcd+:: { + // Configure this to be the IP(s) to scrape - i.e. your etcd node(s) (use commas to separate multiple values). ips: ['127.0.0.1'], - clientCA: importstr 'etcd-client-ca.crt', - clientKey: importstr 'etcd-client.key', - clientCert: importstr 'etcd-client.crt', + + // Set these three variables to values that are valid to scrape etcd metrics with (check the apiserver container). + // Most likely these certificates are generated somewhere in an infrastructure repository, so using the jsonnet `importstr` function can + // be useful here. (Kube-aws stores these three files inside the credential folder.) 
+ // All the sensitive information on the certificates will end up in a Kubernetes Secret. + clientCA: importstr '/path-on-your-work-machine/etcd-client-ca.crt', + clientKey: importstr '/path-on-your-work-machine/etcd-client.key', + clientCert: importstr '/path-on-your-work-machine/etcd-client.crt', + + // A valid name for the certificate serverName: 'etcd.my-cluster.local', + + // TODO: enhance kube-prometheus-static-etcd.libsonnet to allow 'insecureSkipVerify: true' to be specified here (as an alternative to specifying a value for 'serverName'). + // Note that insecureSkipVerify is only to be used if you cannot use a Subject Alternative Name. + + // In case you have generated the etcd certificate with kube-aws: + // * If you only have one etcd node, you can use the value from 'etcd.internalDomainName' (specified in your kube-aws cluster.yaml) as the value for 'serverName'. + // * But if you have multiple etcd nodes, you will need to use 'insecureSkipVerify: true' (if using default certificate generators method), as the valid certificate domain + // will be different for each etcd node. (kube-aws default certificates are not valid against the IP - they were created for the DNS.) }, }, }; diff --git a/examples/minikube.jsonnet b/examples/minikube.jsonnet index 9eda7c4f..e1440798 100644 --- a/examples/minikube.jsonnet +++ b/examples/minikube.jsonnet @@ -1,6 +1,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + // Note that NodePort type services is likely not a good idea for your production use case, it is only used for demonstration purposes here. 
(import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + { _config+:: { diff --git a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet index 23883c2c..04a820f3 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet @@ -61,6 +61,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; port: 'metrics', interval: '30s', scheme: 'https', + // Prometheus Operator (and Prometheus) allow us to specify a tlsConfig. This is required as most likely your etcd metrics end points is secure. tlsConfig: { caFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt', keyFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key', @@ -77,8 +78,8 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, }, secretEtcdCerts: + // Prometheus Operator allows us to mount secrets in the pod. By loading the secrets as files, they can be made available inside the Prometheus pod. 
local secret = k.core.v1.secret; - secret.new('kube-etcd-client-certs', { 'etcd-client-ca.crt': std.base64($._config.etcd.clientCA), 'etcd-client.key': std.base64($._config.etcd.clientKey), @@ -87,6 +88,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; secret.mixin.metadata.withNamespace($._config.namespace), prometheus+: { + // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec spec+: { secrets+: [$.prometheus.secretEtcdCerts.metadata.name], }, From a9f4752b303c9f3aad1514cd79f43ce74b3da7cc Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Wed, 1 Aug 2018 15:59:14 -0500 Subject: [PATCH 357/638] kube-prometheus: mention command to install jsonnet --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 66c8c38b..3e9e6317 100644 --- a/README.md +++ b/README.md @@ -136,13 +136,13 @@ jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} ``` -> Note you need `jsonnet` and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. +> Note you need `jsonnet` (`go get github.com/google/go-jsonnet/jsonnet`) and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. This script reads each key of the generated json and uses that as the file name, and writes the value of that key to that file. ### Containerized Installing and Compiling -If you don't care to have `jb` or `jsonnet` or `gojsontoyaml` installed, then build the `po-jsonnet` Docker image (this is something you'll need a copy of this repository for). 
Do the following from this `kube-prometheus` directory: +If you don't care to have `jb` nor `jsonnet` nor `gojsontoyaml` installed, then build the `po-jsonnet` Docker image (this is something you'll need a copy of this repository for). Do the following from this `kube-prometheus` directory: ``` $ make ../../hack/jsonnet-docker-image ``` From e8e0b639f7098116f558422d40a0433d851d6e20 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Wed, 1 Aug 2018 16:39:11 -0500 Subject: [PATCH 358/638] Incorporate the updates that were in PR #1727 --- README.md | 113 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 65e6081a..daa0619c 100644 --- a/README.md +++ b/README.md @@ -22,12 +22,12 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [Prerequisites](#prerequisites) * [minikube](#minikube) * [Quickstart](#quickstart) -* [Usage](#usage) +* [Customizing Kube-Prometheus](#customizing-kube-prometheus) * [Installing](#installing) * [Compiling](#compiling) * [Containerized Installing and Compiling](#containerized-installing-and-compiling) * [Configuration](#configuration) -* [Customization](#customization) +* [Customization Examples](#customization-examples) * [Cluster Creation Tools](#cluster-creation-tools) * [NodePorts](#nodeports) * [Prometheus Object Name](#prometheus-object-name) @@ -61,9 +61,9 @@ $ minikube delete && minikube start --kubernetes-version=v1.10.1 --memory=4096 - ## Quickstart -This project is intended to be used as a library (i.e. the intent is not for you to create your own customized copy of this repository). +This project is intended to be used as a library (i.e. the intent is not for you to create your own modified copy of this repository). -Though a compiled version of the Kubernetes manifests generated with this library is checked into this repository in order to try the content out quickly. 
+Though for a quickstart a compiled version of the Kubernetes manifests generated with this library (specifically with `example.jsonnet`) is checked into this repository in order to try the content out quickly. To try out the stack un-customized run: * Simply create the stack: ``` $ kubectl create -f manifests/ || true @@ -74,16 +74,19 @@ $ kubectl create -f manifests/ 2>/dev/null || true # This command sometimes may $ kubectl delete -f manifests/ || true ``` -## Usage +## Customizing Kube-Prometheus + +This section describes how to customize the kube-prometheus library. ### Installing The content of this project consists of a set of [jsonnet](http://jsonnet.org/) files making up a library to be consumed. -Install this library in your own project with [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler#install): +Install this library in your own project with [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler#install) (the jsonnet package manager): ``` $ mkdir my-kube-prometheus; cd my-kube-prometheus $ jb init # Creates the initial/empty `jsonnetfile.json` +# Install the kube-prometheus dependency $ jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet/kube-prometheus # Creates `vendor/` & `jsonnetfile.lock.json`, and fills in `jsonnetfile.json` ``` @@ -91,6 +94,9 @@ $ jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonn > An e.g. of how to install a given version of this library: `jb install github.com/coreos/prometheus-operator/contrib/kube-prometheus/jsonnet/kube-prometheus/@v0.22.0` +In order to update the kube-prometheus dependency, simply use the jsonnet-bundler update functionality: +`$ jb update` + ### Compiling You may wish to not use ksonnet/jsonnet and simply render the generated manifests to files on disk (in which case you still don't need a copy of this entire repository, but rather only a copy of a few select files). 
@@ -137,9 +143,9 @@ jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} ``` -> Note you need `jsonnet` (`go get github.com/google/go-jsonnet/jsonnet`) and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. +> Note you need `jsonnet` (`go get github.com/google/go-jsonnet/jsonnet`) and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed to run `build.sh`. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. -This script reads each key of the generated json and uses that as the file name, and writes the value of that key to that file. +This script runs the jsonnet code, then reads each key of the generated json and uses that as the file name, and writes the value of that key to that file, and converts each json manifest to yaml. ### Containerized Installing and Compiling @@ -171,48 +177,76 @@ docker run \ ## Configuration -A hidden `_config` field is located at the top level of the object this library provides. These are the available fields with their respective default values: +Jsonnet has the concept of hidden fields. These are fields, that are not going to be rendered in a result. This is used to configure the kube-prometheus components in jsonnet. In the example jsonnet code of the above [Usage section](#Usage), you can see an example of this, where the `namespace` is being configured to be `monitoring`. In order to not override the whole object, use the `+::` construct of jsonnet, to merge objects, this way you can override individual settings, but retain all other settings and defaults. 
+These are the available fields with their respective default values: ``` { _config+:: { - namespace: "default", + namespace: "default", - versions+:: { - alertmanager: "v0.14.0", - nodeExporter: "v0.15.2", - kubeStateMetrics: "v1.3.0", - kubeRbacProxy: "v0.3.0", - addonResizer: "1.0", - prometheusOperator: "v0.18.1", - prometheus: "v2.2.1", - }, + versions+:: { + alertmanager: "v0.15.0", + nodeExporter: "v0.15.2", + kubeStateMetrics: "v1.3.1", + kubeRbacProxy: "v0.3.1", + addonResizer: "1.0", + prometheusOperator: "v0.18.1", + prometheus: "v2.2.1", + }, - imageRepos+:: { - prometheus: "quay.io/prometheus/prometheus", - alertmanager: "quay.io/prometheus/alertmanager", - kubeStateMetrics: "quay.io/coreos/kube-state-metrics", - kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", - addonResizer: "quay.io/coreos/addon-resizer", - nodeExporter: "quay.io/prometheus/node-exporter", - prometheusOperator: "quay.io/coreos/prometheus-operator", - }, + imageRepos+:: { + prometheus: "quay.io/prometheus/prometheus", + alertmanager: "quay.io/prometheus/alertmanager", + kubeStateMetrics: "quay.io/coreos/kube-state-metrics", + kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", + addonResizer: "quay.io/coreos/addon-resizer", + nodeExporter: "quay.io/prometheus/node-exporter", + prometheusOperator: "quay.io/coreos/prometheus-operator", + }, - prometheus+:: { - replicas: 2, - rules: {}, - }, + prometheus+:: { + names: 'k8s', + replicas: 2, + rules: {}, + }, - alertmanager+:: { - config: alertmanagerConfig, - replicas: 3, - }, + alertmanager+:: { + name: 'main', + config: ||| + global: + resolve_timeout: 5m + route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' + receivers: + - name: 'null' + |||, + replicas: 3, + }, + + kubeStateMetrics+:: { + collectors: '', // empty string gets a default set + scrapeInterval: '30s', + scrapeTimeout: '30s', + + baseCPU: '100m', + 
baseMemory: '150Mi', + cpuPerNode: '2m', + memoryPerNode: '30Mi', + }, }, } ``` -The grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but needed configuration can be customized from the same file. F.e. to allow anonymous access to grafana, add the `_config` section: - +The grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but needed configuration can be customized from the same top level `_config` field. For example to allow anonymous access to grafana, add the following `_config` section: ``` grafana+:: { config: { @@ -223,8 +257,7 @@ The grafana definition is located in a different project (https://github.com/bra }, ``` - -## Customization +## Customization Examples Jsonnet is a turing complete language, any logic can be reflected in it. It also has powerful merge functionalities, allowing sophisticated customizations of any kind simply by merging it into the object the library provides. 
From d009846be9c978b300d91ab35bc94e8d634fe261 Mon Sep 17 00:00:00 2001 From: Vasilis Remmas Date: Wed, 1 Aug 2018 19:40:49 +0200 Subject: [PATCH 359/638] Create rolebinding using $._config.namespace Assign the rolebinding to an existing serviceaccount --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 026acad0..c993d7af 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -66,7 +66,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: namespace }]); + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]); local roleBindigList = k.rbac.v1.roleBindingList; roleBindigList.new([newSpecificRoleBinding(x) for x in $._config.prometheus.namespaces]), From fa32316597b24401f5794566b386e92c6dcaf9d4 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 1 Aug 2018 22:51:16 +0200 Subject: [PATCH 360/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 4 +- manifests/grafana-dashboardDefinitions.yaml | 439 ++++++++++++++++-- manifests/prometheus-roleBindingDefault.yaml | 13 - .../prometheus-roleBindingKubeSystem.yaml | 13 - .../prometheus-roleBindingNamespace.yaml | 13 - ...metheus-roleBindingSpecificNamespaces.yaml | 42 ++ manifests/prometheus-roleDefault.yaml | 17 - manifests/prometheus-roleKubeSystem.yaml | 17 - manifests/prometheus-roleNamespace.yaml | 17 - 
.../prometheus-roleSpecificNamespaces.yaml | 54 +++ 10 files changed, 494 insertions(+), 135 deletions(-) delete mode 100644 manifests/prometheus-roleBindingDefault.yaml delete mode 100644 manifests/prometheus-roleBindingKubeSystem.yaml delete mode 100644 manifests/prometheus-roleBindingNamespace.yaml create mode 100644 manifests/prometheus-roleBindingSpecificNamespaces.yaml delete mode 100644 manifests/prometheus-roleDefault.yaml delete mode 100644 manifests/prometheus-roleKubeSystem.yaml delete mode 100644 manifests/prometheus-roleNamespace.yaml create mode 100644 manifests/prometheus-roleSpecificNamespaces.yaml diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 64e89c9b..6bc1aaa1 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "d7afb094898de5817f3cb49807dee44513d6d121" + "version": "87358f68efee390c30c7d26ea98faeee331a177a" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "2644ba2e0002715d0a079b5e1214128f72cbcc3b" + "version": "3c341913ddd3882c8f1edc1c20accdbcaaf10525" }, { "name": "grafonnet", diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e3fa61c7..af2b2b0a 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1915,12 +1915,12 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores) / sum(node:node_num_cpu:sum)", + "expr": ":node_cpu_utilisation:avg1m", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -1930,7 +1930,7 @@ items: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "CPU Requests Commitment", + "title": "CPU Utilisation", "tooltip": { "shared": true, "sort": 0, @@ -1999,12 +1999,12 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 
2, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores) / sum(node:node_num_cpu:sum)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores) / sum(node:node_num_cpu:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2014,7 +2014,7 @@ items: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "CPU Limits Commitment", + "title": "CPU Requests Commitment", "tooltip": { "shared": true, "sort": 0, @@ -2083,12 +2083,12 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 2, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(node_memory_MemTotal)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores) / sum(node:node_num_cpu:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2098,7 +2098,7 @@ items: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Requests Commitment", + "title": "CPU Limits Commitment", "tooltip": { "shared": true, "sort": 0, @@ -2167,7 +2167,175 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": ":node_memory_utilisation:", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, 
+ "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(node_memory_MemTotal)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Requests Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 2, "stack": false, "steppedLine": false, "targets": [ @@ -2238,7 +2406,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 4, + "id": 6, "legend": { "avg": false, 
"current": false, @@ -2336,7 +2504,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "id": 7, "legend": { "avg": false, "current": false, @@ -2603,7 +2771,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 8, "legend": { "avg": false, "current": false, @@ -2701,7 +2869,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "id": 9, "legend": { "avg": false, "current": false, @@ -4792,11 +4960,25 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - (avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[5m])))\n", + "expr": "max(node_load1{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", - "intervalFactor": 10, - "legendFormat": "{{cpu}}", + "intervalFactor": 2, + "legendFormat": "load 1m", "refId": "A" + }, + { + "expr": "max(node_load5{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 5m", + "refId": "B" + }, + { + "expr": "max(node_load15{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 15m", + "refId": "C" } ], "thresholds": [ @@ -4804,7 +4986,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Idle CPU", + "title": "System load", "tooltip": { "shared": true, "sort": 0, @@ -4825,16 +5007,16 @@ items: "format": "percentunit", "label": null, "logBase": 1, - "max": 1, - "min": 0, + "max": null, + "min": null, "show": true }, { "format": "percentunit", "label": null, "logBase": 1, - "max": 1, - "min": 0, + "max": null, + "min": null, "show": true } ] @@ -4883,25 +5065,11 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(node_load1{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m])) * 100", "format": "time_series", 
"intervalFactor": 2, - "legendFormat": "load 1m", + "legendFormat": "{{cpu}}", "refId": "A" - }, - { - "expr": "max(node_load5{job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 5m", - "refId": "B" - }, - { - "expr": "max(node_load15{job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 15m", - "refId": "C" } ], "thresholds": [ @@ -4971,6 +5139,191 @@ items: }, "id": 4, + "legend": { + "alignAsTable": "true", + "avg": "true", + "current": "true", + "max": "false", + "min": "false", + "rightSide": "true", + "show": "true", + "total": "false", + "values": "true" + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg (sum by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", + "format": "time_series", + "intervalFactor": 10, + "legendFormat": "{{ cpu }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilizaion", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": 
"$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(sum by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "CPU Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 6, "legend": { "alignAsTable": false, "avg": false, @@ -5091,7 +5444,7 @@ items: "gridPos": { }, - "id": 5, + "id": 7, "interval": null, "links": [ @@ -5176,7 +5529,7 @@ items: "gridPos": { }, - "id": 6, + "id": 8, "legend": { "alignAsTable": false, "avg": false, @@ -5297,7 +5650,7 @@ items: "gridPos": { }, - "id": 7, + "id": 9, "interval": null, "links": [ @@ -5382,7 
+5735,7 @@ items: "gridPos": { }, - "id": 8, + "id": 10, "legend": { "alignAsTable": false, "avg": false, @@ -5473,7 +5826,7 @@ items: "gridPos": { }, - "id": 9, + "id": 11, "legend": { "alignAsTable": false, "avg": false, diff --git a/manifests/prometheus-roleBindingDefault.yaml b/manifests/prometheus-roleBindingDefault.yaml deleted file mode 100644 index c4039710..00000000 --- a/manifests/prometheus-roleBindingDefault.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: prometheus-k8s - namespace: default -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring diff --git a/manifests/prometheus-roleBindingKubeSystem.yaml b/manifests/prometheus-roleBindingKubeSystem.yaml deleted file mode 100644 index 250c7307..00000000 --- a/manifests/prometheus-roleBindingKubeSystem.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: prometheus-k8s - namespace: kube-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring diff --git a/manifests/prometheus-roleBindingNamespace.yaml b/manifests/prometheus-roleBindingNamespace.yaml deleted file mode 100644 index 068c77d3..00000000 --- a/manifests/prometheus-roleBindingNamespace.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: prometheus-k8s - namespace: monitoring -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring diff --git a/manifests/prometheus-roleBindingSpecificNamespaces.yaml b/manifests/prometheus-roleBindingSpecificNamespaces.yaml new file mode 100644 index 00000000..c7527f6a --- /dev/null +++ 
b/manifests/prometheus-roleBindingSpecificNamespaces.yaml @@ -0,0 +1,42 @@ +apiVersion: rbac.authorization.k8s.io/v1 +items: +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s + namespace: default + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s + namespace: kube-system + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s + namespace: monitoring + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +kind: RoleBindingList diff --git a/manifests/prometheus-roleDefault.yaml b/manifests/prometheus-roleDefault.yaml deleted file mode 100644 index 1c336117..00000000 --- a/manifests/prometheus-roleDefault.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: prometheus-k8s - namespace: default -rules: -- apiGroups: - - "" - resources: - - nodes - - services - - endpoints - - pods - verbs: - - get - - list - - watch diff --git a/manifests/prometheus-roleKubeSystem.yaml b/manifests/prometheus-roleKubeSystem.yaml deleted file mode 100644 index d82fe3ab..00000000 --- a/manifests/prometheus-roleKubeSystem.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: prometheus-k8s - namespace: kube-system -rules: -- apiGroups: - - "" - resources: - - nodes - - services - - endpoints - - pods - verbs: - - get - - list - - watch diff --git a/manifests/prometheus-roleNamespace.yaml 
b/manifests/prometheus-roleNamespace.yaml deleted file mode 100644 index 343cfc6d..00000000 --- a/manifests/prometheus-roleNamespace.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: prometheus-k8s - namespace: monitoring -rules: -- apiGroups: - - "" - resources: - - nodes - - services - - endpoints - - pods - verbs: - - get - - list - - watch diff --git a/manifests/prometheus-roleSpecificNamespaces.yaml b/manifests/prometheus-roleSpecificNamespaces.yaml new file mode 100644 index 00000000..b305774a --- /dev/null +++ b/manifests/prometheus-roleSpecificNamespaces.yaml @@ -0,0 +1,54 @@ +apiVersion: rbac.authorization.k8s.io/v1 +items: +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-k8s + namespace: default + rules: + - apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-k8s + namespace: kube-system + rules: + - apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-k8s + namespace: monitoring + rules: + - apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch +kind: RoleList From 7ac4da1f9fc526bd7e00f12057f733be04976db2 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Fri, 3 Aug 2018 08:42:50 -0500 Subject: [PATCH 361/638] kube-prometheus: add more commentary to etcd.jsonnet --- examples/etcd.jsonnet | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet index fcce57be..cb864f97 100644 --- a/examples/etcd.jsonnet +++ b/examples/etcd.jsonnet @@ -5,9 +5,14 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + // Reference info: 
https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/README.md#static-etcd-configuration etcd+:: { - // Configure this to be the IP(s) to scrape - i.e. your etcd node(s) (use commans to separate multiple values). + // Configure this to be the IP(s) to scrape - i.e. your etcd node(s) (use commas to separate multiple values). ips: ['127.0.0.1'], + // Reference info: + // * https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#servicemonitorspec (has endpoints) + // * https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#endpoint (has tlsConfig) + // * https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#tlsconfig (has: caFile, certFile, keyFile, serverName, & insecureSkipVerify) + // Set these three variables to values that are valid to scrape etcd metrics with (check the apiserver container). // Most likely these certificates are generated somewhere in an infrastructure repository, so using the jsonnet `importstr` function can // be useful here. (Kube-aws stores these three files inside the credential folder.) @@ -16,7 +21,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + clientKey: importstr '/path-on-your-work-machine/etcd-client.key', clientCert: importstr '/path-on-your-work-machine/etcd-client.crt', - // A valid name for the certificate + // A valid name for the certificate. serverName: 'etcd.my-cluster.local', // TODO: enhance kube-prometheus-static-etcd.libsonnet to allow 'insecureSkipVerify: true' to be specified here (as an alternative to specifying a value for 'serverName'). 
From c438b23991c74b9aab8faa898e74285f2419e002 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Fri, 3 Aug 2018 08:50:29 -0500 Subject: [PATCH 362/638] kube-prometheus: another etcd.jsonnet commentary tweak --- examples/etcd.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet index cb864f97..cadf1b9e 100644 --- a/examples/etcd.jsonnet +++ b/examples/etcd.jsonnet @@ -21,7 +21,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + clientKey: importstr '/path-on-your-work-machine/etcd-client.key', clientCert: importstr '/path-on-your-work-machine/etcd-client.crt', - // A valid name for the certificate. + // A valid name (DNS or Subject Alternative Name) for the etcd certificate. serverName: 'etcd.my-cluster.local', // TODO: enhance kube-prometheus-static-etcd.libsonnet to allow 'insecureSkipVerify: true' to be specified here (as an alternative to specifying a value for 'serverName'). From 0f76dc63b6c7fb6c4828bfb08f44e82c28404f77 Mon Sep 17 00:00:00 2001 From: Robert Nemeti Date: Wed, 1 Aug 2018 13:34:37 +0000 Subject: [PATCH 363/638] add serviceMonitorNamespaceSelector to the prometheus jsonnet library to implement the feature from PR #1227 --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index c993d7af..3d8bea3e 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -167,6 +167,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; baseImage: $._config.imageRepos.prometheus, serviceAccountName: 'prometheus-' + $._config.prometheus.name, serviceMonitorSelector: selector.withMatchExpressions({ key: 'k8s-app', operator: 'Exists' }), + serviceMonitorNamespaceSelector: selector.withMatchExpressions({ key: 'prometheus', operator: 'In', 
values: ['yes', 'true'] }), nodeSelector: { 'beta.kubernetes.io/os': 'linux' }, ruleSelector: selector.withMatchLabels({ role: 'alert-rules', From 43dc7999cac2fdecd7f76b9e5dd51dd8f89cc8c7 Mon Sep 17 00:00:00 2001 From: Robert Nemeti Date: Mon, 6 Aug 2018 08:19:28 +0000 Subject: [PATCH 364/638] provide empty hash to serviceMonitorNamespaceSelector, which selects all the namespaces by default --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 3d8bea3e..a696d0b6 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -167,7 +167,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; baseImage: $._config.imageRepos.prometheus, serviceAccountName: 'prometheus-' + $._config.prometheus.name, serviceMonitorSelector: selector.withMatchExpressions({ key: 'k8s-app', operator: 'Exists' }), - serviceMonitorNamespaceSelector: selector.withMatchExpressions({ key: 'prometheus', operator: 'In', values: ['yes', 'true'] }), + serviceMonitorNamespaceSelector: selector.withMatchExpressions({}), nodeSelector: { 'beta.kubernetes.io/os': 'linux' }, ruleSelector: selector.withMatchLabels({ role: 'alert-rules', From 7213f93e41de59d6d9f391f895c1826964234710 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Mon, 6 Aug 2018 14:11:42 +0200 Subject: [PATCH 365/638] *: cut 0.23.0 --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index edbf70e5..35b98e11 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -51,4 +51,4 @@ "version": "master" } ] -} \ No newline at end of file +} From ce2ead0e04afd3d4a42ccb3b6fc07c11ab7321c0 Mon Sep 
17 00:00:00 2001 From: Joshua Olson Date: Mon, 6 Aug 2018 12:40:45 -0500 Subject: [PATCH 366/638] kube-prometheus: remove mention of ksonnet example --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index daa0619c..498f71b7 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ $ minikube delete && minikube start --kubernetes-version=v1.10.1 --memory=4096 - This project is intended to be used as a library (i.e. the intent is not for you to create your own modified copy of this repository). -Though for a quickstart a compiled version of the Kubernetes manifests generated with this library (specifically with `example.jsonnet`) is checked into this repository in order to try the content out quickly. To try out the stack un-customized run: +Though for a quickstart a compiled version of the Kubernetes [manifests](manifests) generated with this library (specifically with `example.jsonnet`) is checked into this repository in order to try the content out quickly. To try out the stack un-customized run: * Simply create the stack: ``` $ kubectl create -f manifests/ || true @@ -76,7 +76,9 @@ $ kubectl delete -f manifests/ || true ## Customizing Kube-Prometheus -This section describes how to customize the kube-prometheus library. +This section: + * describes how to customize the kube-prometheus library via compiling the kube-prometheus manifests yourself (as an alternative to the [Quickstart section](#Quickstart)). + * still doesn't require you to make a copy of this entire repository, but rather only a copy of a few select files. ### Installing @@ -99,8 +101,7 @@ In order to update the kube-prometheus dependency, simply use the jsonnet-bundle ### Compiling -You may wish to not use ksonnet/jsonnet and simply render the generated manifests to files on disk (in which case you still don't need a copy of this entire repository, but rather only a copy of a few select files). -This can be done e.g. 
by running `./build.sh example.jsonnet`. +e.g. of how to compile the manifests: `./build.sh example.jsonnet` Here's [example.jsonnet](example.jsonnet): @@ -121,7 +122,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` -And here's the [build.sh](build.sh) script (this renders all manifests in a json structure of `{filename: manifest-content}`): +And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`): [embedmd]:# (build.sh) ```sh From 3380c391810b1429e80038409d1cce585e62e1ce Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Mon, 6 Aug 2018 12:41:34 -0500 Subject: [PATCH 367/638] kube-prometheus: revise commentary about serverName and insecureSkipVerify --- examples/etcd.jsonnet | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet index cadf1b9e..871f5747 100644 --- a/examples/etcd.jsonnet +++ b/examples/etcd.jsonnet @@ -21,11 +21,12 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + clientKey: importstr '/path-on-your-work-machine/etcd-client.key', clientCert: importstr '/path-on-your-work-machine/etcd-client.crt', - // A valid name (DNS or Subject Alternative Name) for the etcd certificate. - serverName: 'etcd.my-cluster.local', + // A valid name (DNS or Subject Alternative Name) that the client (i.e. prometheus) will use to verify the etcd TLS certificate. + serverName: 'etcd.my-cluster.local', // a real-life e.g. value is "etcd.kube-system.svc.cluster.local" - // TODO: enhance kube-prometheus-static-etcd.libsonnet to allow 'insecureSkipVerify: true' to be specified here (as an alternative to specifying a value for 'serverName'). - // Note that insecureSkipVerify is only to be used if you cannot use a Subject Alternative Name. 
+ // TODO: prometheus-operator issue #1755 ("kube-prometheus: re-introduce insecureSkipVerify for etcd monitoring") will enhance kube-prometheus-static-etcd.libsonnet + // to allow 'insecureSkipVerify: true' to be specified here (as an alternative to specifying a value for 'serverName'). + // Note that once that issue #1755 is resolved, insecureSkipVerify is only to be used if you cannot use a Subject Alternative Name. // In case you have generated the etcd certificate with kube-aws: // * If you only have one etcd node, you can use the value from 'etcd.internalDomainName' (specified in your kube-aws cluster.yaml) as the value for 'serverName'. From 6574cd58aa14e90a9ee32fe6ea83c742040c35ca Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Mon, 6 Aug 2018 12:48:19 -0500 Subject: [PATCH 368/638] kube-prometheus: change the 3 etcd client cert files back to files that actually exist in this repo In order to resolve failed test: https://travis-ci.org/coreos/prometheus-operator/jobs/411752894 --- examples/etcd.jsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet index 871f5747..c8687c89 100644 --- a/examples/etcd.jsonnet +++ b/examples/etcd.jsonnet @@ -13,13 +13,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + // * https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#endpoint (has tlsConfig) // * https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#tlsconfig (has: caFile, certFile, keyFile, serverName, & insecureSkipVerify) - // Set these three variables to values that are valid to scrape etcd metrics with (check the apiserver container). + // Set these three variables to the fully qualified directory path on your work machine to the certificate files that are valid to scrape etcd metrics with (check the apiserver container). 
// Most likely these certificates are generated somewhere in an infrastructure repository, so using the jsonnet `importstr` function can // be useful here. (Kube-aws stores these three files inside the credential folder.) // All the sensitive information on the certificates will end up in a Kubernetes Secret. - clientCA: importstr '/path-on-your-work-machine/etcd-client-ca.crt', - clientKey: importstr '/path-on-your-work-machine/etcd-client.key', - clientCert: importstr '/path-on-your-work-machine/etcd-client.crt', + clientCA: importstr 'etcd-client-ca.crt', + clientKey: importstr 'etcd-client.key', + clientCert: importstr 'etcd-client.crt', // A valid name (DNS or Subject Alternative Name) that the client (i.e. prometheus) will use to verify the etcd TLS certificate. serverName: 'etcd.my-cluster.local', // a real-life e.g. value is "etcd.kube-system.svc.cluster.local" From 28e3ef5a6d2cd341f940c606f27d2ce7df42a7ef Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Tue, 7 Aug 2018 17:40:34 +0200 Subject: [PATCH 369/638] contrib/kube-prometheus: bump prometheus-operator This commit bumps the version of the Prometheus Operator jsonnet dependency in kube-prometheus. With this change, kube-prometheus now supports Prometheus Operator v0.23.0. 
--- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- jsonnetfile.lock.json | 6 +++--- manifests/prometheus-prometheus.yaml | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 35b98e11..1ad3ceb3 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.22.2" + "version": "v0.23.0" }, { "name": "etcd-mixin", diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 6bc1aaa1..4ffa3a3d 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "87358f68efee390c30c7d26ea98faeee331a177a" + "version": "f86bcb58b967bb41781976a2b25906fd536d3aa4" }, { "name": "ksonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "42f72ac60fad1022d26f1a88062689e94219d582" + "version": "a17504d81e2ab5b29d97c77faf72b8d7644bbabf" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "93be31d43a2728d1750120aeae7a483698ccead2" + "version": "9ad8f4c350368d42d988dd8c05801e40f597d369" } ] } \ No newline at end of file diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 9a7448b6..45ce2dab 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -23,6 +23,9 @@ spec: prometheus: k8s role: alert-rules serviceAccountName: prometheus-k8s + serviceMonitorNamespaceSelector: + matchExpressions: + - {} serviceMonitorSelector: matchExpressions: - key: k8s-app From ad673494e73f007a034c7c103160edc19335d0cb Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Wed, 8 Aug 2018 07:55:59 -0500 Subject: [PATCH 370/638] Add more etcd TLS commentary Per the outcome of issue #1755 & PR #1756 --- examples/etcd.jsonnet | 18 +++++++++++++----- 1 file 
changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet index c8687c89..e26c9571 100644 --- a/examples/etcd.jsonnet +++ b/examples/etcd.jsonnet @@ -21,12 +21,20 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + clientKey: importstr 'etcd-client.key', clientCert: importstr 'etcd-client.crt', - // A valid name (DNS or Subject Alternative Name) that the client (i.e. prometheus) will use to verify the etcd TLS certificate. - serverName: 'etcd.my-cluster.local', // a real-life e.g. value is "etcd.kube-system.svc.cluster.local" + // Note that you should specify a value EITHER for 'serverName' OR for 'insecureSkipVerify'. (Don't specify a value for both of them, and don't specify a value for neither of them.) + // * Specifying serverName: Ideally you should provide a valid value for serverName (and then insecureSkipVerify should be left as false - so that serverName gets used). + // * Specifying insecureSkipVerify: insecureSkipVerify is only to be used (i.e. set to true) if you cannot (based on how your etcd certificates were created) use a Subject Alternative Name. + // * If you specify a value: + // ** for both of these variables: When 'insecureSkipVerify: true' is specified, then also specifying a value for serverName won't hurt anything but it will be ignored. + // ** for neither of these variables: then you'll get authentication errors on the prom '/targets' page with your etcd targets. - // TODO: prometheus-operator issue #1755 ("kube-prometheus: re-introduce insecureSkipVerify for etcd monitoring") will enhance kube-prometheus-static-etcd.libsonnet - // to allow 'insecureSkipVerify: true' to be specified here (as an alternative to specifying a value for 'serverName'). - // Note that once that issue #1755 is resolved, insecureSkipVerify is only to be used if you cannot use a Subject Alternative Name. + // A valid name (DNS or Subject Alternative Name) that the client (i.e. 
prometheus) will use to verify the etcd TLS certificate. + // * Note that doing `nslookup etcd.kube-system.svc.cluster.local` (on a pod in a K8s cluster where kube-prometheus has been installed) shows that kube-prometheus sets up this hostname. + // * `openssl x509 -noout -text -in etcd-client.pem` will print the Subject Alternative Names. + serverName: 'etcd.kube-system.svc.cluster.local', + + // When insecureSkipVerify isn't specified, the default value is "false". + //insecureSkipVerify: true, // In case you have generated the etcd certificate with kube-aws: // * If you only have one etcd node, you can use the value from 'etcd.internalDomainName' (specified in your kube-aws cluster.yaml) as the value for 'serverName'. From 25bb07fe7d72c8422c9c278595bc890742ba4550 Mon Sep 17 00:00:00 2001 From: Henrique Fernandes Date: Tue, 7 Aug 2018 10:39:12 -0300 Subject: [PATCH 371/638] Fix typos --- jsonnet/kube-prometheus/alerts/prometheus.libsonnet | 2 +- manifests/prometheus-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet index 32d8262b..99be08ff 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet @@ -131,7 +131,7 @@ }, }, { - alert: 'PrometheusTargetScapesDuplicate', + alert: 'PrometheusTargetScrapesDuplicate', annotations: { description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values', summary: 'Prometheus has many samples rejected', diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 5af7d2fa..695bcb0d 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -849,7 +849,7 @@ spec: for: 10m labels: severity: warning - - alert: PrometheusTargetScapesDuplicate + - alert: PrometheusTargetScrapesDuplicate annotations: description: 
'{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values' From 2c27ec76a18daaa932207de2080b680d2acd77fd Mon Sep 17 00:00:00 2001 From: Henrique Fernandes Date: Wed, 8 Aug 2018 10:26:27 -0300 Subject: [PATCH 372/638] Bump jsonnet dependencies --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 4ffa3a3d..06057a46 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "f86bcb58b967bb41781976a2b25906fd536d3aa4" + "version": "01491ecb1b47620e19bf59fdc104add04a4b4ac8" }, { "name": "ksonnet", From ff878498df2359f08c45f1bacd08b84e357bd5ef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 8 Aug 2018 16:39:16 +0200 Subject: [PATCH 373/638] kube-promethes: Fix default serviceMonitorSelector --- .../prometheus/prometheus.libsonnet | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index a696d0b6..2f46190c 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -21,7 +21,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; replicas: 2, rules: {}, renderedRules: {}, - namespaces: ["default", "kube-system",$._config.namespace], + namespaces: ['default', 'kube-system', $._config.namespace], }, }, @@ -59,7 +59,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; roleBindingSpecificNamespaces: local roleBinding = k.rbac.v1.roleBinding; - local newSpecificRoleBinding(namespace) = + local newSpecificRoleBinding(namespace) = roleBinding.new() + roleBinding.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.metadata.withNamespace(namespace) + @@ -67,7 +67,7 @@ 
local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; roleBinding.mixin.roleRef.withName('prometheus-' + $._config.prometheus.name) + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + $._config.prometheus.name, namespace: $._config.namespace }]); - + local roleBindigList = k.rbac.v1.roleBindingList; roleBindigList.new([newSpecificRoleBinding(x) for x in $._config.prometheus.namespaces]), clusterRole: @@ -134,13 +134,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; 'pods', ]) + policyRule.withVerbs(['get', 'list', 'watch']); - - local newSpecificRole(namespace) = + + local newSpecificRole(namespace) = role.new() + role.mixin.metadata.withName('prometheus-' + $._config.prometheus.name) + role.mixin.metadata.withNamespace(namespace) + role.withRules(coreRule); - + local roleList = k.rbac.v1.roleList; roleList.new([newSpecificRole(x) for x in $._config.prometheus.namespaces]), prometheus: @@ -166,8 +166,8 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; version: $._config.versions.prometheus, baseImage: $._config.imageRepos.prometheus, serviceAccountName: 'prometheus-' + $._config.prometheus.name, - serviceMonitorSelector: selector.withMatchExpressions({ key: 'k8s-app', operator: 'Exists' }), - serviceMonitorNamespaceSelector: selector.withMatchExpressions({}), + serviceMonitorSelector: {}, + serviceMonitorNamespaceSelector: {}, nodeSelector: { 'beta.kubernetes.io/os': 'linux' }, ruleSelector: selector.withMatchLabels({ role: 'alert-rules', From 973fc19fa147ee31279c120894fecb79002c5b58 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 8 Aug 2018 16:40:18 +0200 Subject: [PATCH 374/638] kube-prometheus: Bump kube-prometheus jsonnet dep --- jsonnetfile.lock.json | 8 ++++---- manifests/prometheus-prometheus.yaml | 9 ++------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 06057a46..7b527052 
100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "01491ecb1b47620e19bf59fdc104add04a4b4ac8" + "version": "3dc6ff1b6a3b3efe8152bdaddd8a41fb2f6cbf37" }, { "name": "ksonnet", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "8f131a315dfe877819c3a490eedfbcfe183b95cf" + "version": "942cd2349e27c2510c71158ee9ca953df33724f2" }, { "name": "prometheus-operator", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "87000af76132515a1a2721e836c282d97f82593b" + "version": "82ac49106139eee53f1d76e062782f2e8449dd45" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "9ad8f4c350368d42d988dd8c05801e40f597d369" + "version": "6c9a853f04f8e0cde6139f3a9d04d00517407b91" } ] } \ No newline at end of file diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 45ce2dab..484c33b4 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -23,11 +23,6 @@ spec: prometheus: k8s role: alert-rules serviceAccountName: prometheus-k8s - serviceMonitorNamespaceSelector: - matchExpressions: - - {} - serviceMonitorSelector: - matchExpressions: - - key: k8s-app - operator: Exists + serviceMonitorNamespaceSelector: {} + serviceMonitorSelector: {} version: v2.3.1 From 159eb0549738615ea78539457c5545c2edfd1f6c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 8 Aug 2018 17:43:29 +0200 Subject: [PATCH 375/638] kube-prometheus: Update dependencies --- jsonnetfile.lock.json | 2 +- ...0alertmanagerCustomResourceDefinition.yaml | 14 ++- ...r-0prometheusCustomResourceDefinition.yaml | 118 ++++++++++++++++-- ...ervicemonitorCustomResourceDefinition.yaml | 2 +- .../0prometheus-operator-deployment.yaml | 12 +- .../0prometheus-operator-serviceMonitor.yaml | 3 +- manifests/grafana-dashboardSources.yaml | 25 ++-- 7 files changed, 146 insertions(+), 
30 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 7b527052..5951577c 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "3dc6ff1b6a3b3efe8152bdaddd8a41fb2f6cbf37" + "version": "793d90134afffc41c07d0482794379962f3e14ec" }, { "name": "ksonnet", diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 6f30397a..6ee94918 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'Specification of the desired behavior of the Alertmanager - cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'AlertmanagerSpec is a specification of the desired behavior + of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: affinity: description: Affinity is a group of affinity scheduling rules. @@ -1687,6 +1687,10 @@ spec: to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object + retention: + description: Time duration Alertmanager shall retain data for. Default + is '120h'. + type: string routePrefix: description: The route prefix Alertmanager registers HTTP handlers for. 
This is useful, if using ExternalURL and a proxy is rewriting HTTP @@ -2372,9 +2376,9 @@ spec: description: Version the cluster should be on. type: string status: - description: 'Most recent observed status of the Alertmanager cluster. Read-only. - Not included when requesting from the apiserver, only from the Prometheus - Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'AlertmanagerStatus is the most recent observed status of the + Alertmanager cluster. Read-only. Not included when requesting from the + apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 140deffa..6eba60db 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'Specification of the desired behavior of the Prometheus cluster. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'PrometheusSpec is a specification of the desired behavior + of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: additionalAlertManagerConfigs: description: SecretKeySelector selects a key of a Secret. 
@@ -671,6 +671,76 @@ spec: type: array required: - alertmanagers + apiserverConfig: + description: 'APIServerConfig defines a host and auth methods to access + apiserver. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config' + properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over basic + authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bearerToken: + description: Bearer token for accessing apiserver. + type: string + bearerTokenFile: + description: File to read bearer token for accessing apiserver. + type: string + host: + description: Host of apiserver. A valid string consisting of a hostname + or IP followed by an optional port number + type: string + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. 
+ type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + required: + - host baseImage: description: Base image to use for a Prometheus deployment. type: string @@ -2024,7 +2094,8 @@ spec: value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object retention: - description: Time duration Prometheus shall retain data for. + description: Time duration Prometheus shall retain data for. Default + is '24h'. type: string routePrefix: description: The route prefix Prometheus registers HTTP handlers for. @@ -2858,11 +2929,41 @@ spec: description: Google Cloud Storage bucket name for stored blocks. If empty it won't store any block inside Google Cloud Storage. type: string + credentials: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key peers: description: Peers is a DNS name for Thanos to discover peers through. type: string + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute + resources required. 
If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object s3: - description: ThanosSpec defines parameters for of AWS Simple Storage + description: ThanosS3Spec defines parameters for of AWS Simple Storage Service (S3) with Thanos. (S3 compatible services apply as well) properties: accessKey: @@ -2884,6 +2985,9 @@ spec: bucket: description: S3-Compatible API bucket name for stored blocks. type: string + encryptsse: + description: Whether to use Server Side Encryption + type: boolean endpoint: description: S3-Compatible API endpoint for stored blocks. type: string @@ -2961,9 +3065,9 @@ spec: description: Version of Prometheus to be deployed. type: string status: - description: 'Most recent observed status of the Prometheus cluster. Read-only. - Not included when requesting from the apiserver, only from the Prometheus - Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'PrometheusStatus is the most recent observed status of the + Prometheus cluster. Read-only. Not included when requesting from the apiserver, + only from the Prometheus Operator API itself. 
More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index f3068cf8..9d96bfeb 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -169,7 +169,7 @@ spec: description: The label to use to retrieve the job name from. type: string namespaceSelector: - description: A selector for selecting namespaces either selecting all + description: NamespaceSelector is a selector for selecting either all namespaces or a list of namespaces. properties: any: diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 358fb6e2..020b54a2 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -18,9 +18,10 @@ spec: containers: - args: - --kubelet-service=kube-system/kubelet + - -logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.2 - image: quay.io/coreos/prometheus-operator:v0.22.2 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.23.0 + image: quay.io/coreos/prometheus-operator:v0.23.0 name: prometheus-operator ports: - containerPort: 8080 @@ -28,10 +29,13 @@ spec: resources: limits: cpu: 200m - memory: 100Mi + memory: 200Mi requests: cpu: 100m - memory: 50Mi + memory: 100Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true nodeSelector: beta.kubernetes.io/os: linux securityContext: diff --git a/manifests/0prometheus-operator-serviceMonitor.yaml 
b/manifests/0prometheus-operator-serviceMonitor.yaml index 10e0059a..14f402fb 100644 --- a/manifests/0prometheus-operator-serviceMonitor.yaml +++ b/manifests/0prometheus-operator-serviceMonitor.yaml @@ -7,7 +7,8 @@ metadata: namespace: monitoring spec: endpoints: - - port: http + - honorLabels: true + port: http selector: matchLabels: k8s-app: prometheus-operator diff --git a/manifests/grafana-dashboardSources.yaml b/manifests/grafana-dashboardSources.yaml index 61fdcf61..d8b401a7 100644 --- a/manifests/grafana-dashboardSources.yaml +++ b/manifests/grafana-dashboardSources.yaml @@ -1,17 +1,20 @@ apiVersion: v1 data: dashboards.yaml: |- - [ - { - "folder": "", - "name": "0", - "options": { - "path": "/grafana-dashboard-definitions/0" - }, - "org_id": 1, - "type": "file" - } - ] + { + "apiVersion": 1, + "providers": [ + { + "folder": "", + "name": "0", + "options": { + "path": "/grafana-dashboard-definitions/0" + }, + "orgId": 1, + "type": "file" + } + ] + } kind: ConfigMap metadata: name: grafana-dashboards From eb23d1d16ca56a400de061fc210fd62e7b2b0f03 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 13 Aug 2018 16:23:51 +0200 Subject: [PATCH 376/638] kube-prometheus: Add docs on monitoring additional namespaces --- README.md | 26 ++++++++++++++++++++++++++ examples/additional-namespaces.jsonnet | 17 +++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 examples/additional-namespaces.jsonnet diff --git a/README.md b/README.md index b653dc29..84aeb1d3 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,32 @@ In the above example the configuration has been inlined, but can just as well be }, }).alertmanager.secret ``` + +### Adding additional namespaces to monitor + +In order to monitor additional namespaces, the Prometheus server requires the appropriate `Role` and `RoleBinding` to be able to discover targets from that namespace. 
By default the Prometheus server is limited to the three namespaces it requires: default, kube-system and the namespace you configure the stack to run in via `$._config.namespace`. This is specified in `$._config.prometheus.namespaces`, to add new namespaces to monitor, simply append the additional namespaces: + +[embedmd]:# (examples/additional-namespaces.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + + prometheus+:: { + namespaces+: ['my-namespace', 'my-second-namespace'], + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + ### Static etcd configuration In order to configure a static etcd cluster to scrape there is a simple mixin prepared, so only the IPs and certificate information need to be configured. Simply append the `kube-prometheus/kube-prometheus-static-etcd.libsonnet` mixin to the rest of the configuration, and configure the `ips` to be the IPs to scrape, and the `clientCA`, `clientKey` and `clientCert` to values that are valid to scrape etcd metrics with. 
diff --git a/examples/additional-namespaces.jsonnet b/examples/additional-namespaces.jsonnet new file mode 100644 index 00000000..957fd912 --- /dev/null +++ b/examples/additional-namespaces.jsonnet @@ -0,0 +1,17 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + + prometheus+:: { + namespaces+: ['my-namespace', 'my-second-namespace'], + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } From a559123a2b22a15443b9d13d84e1ab04f6bae6fa Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 6 Aug 2018 21:16:59 +0200 Subject: [PATCH 377/638] kube-prometheus: Allow skipping etcd TLS errors --- examples/etcd-skip-verify.jsonnet | 22 +++++++++++++++++++ .../kube-prometheus-static-etcd.libsonnet | 4 +++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 examples/etcd-skip-verify.jsonnet diff --git a/examples/etcd-skip-verify.jsonnet b/examples/etcd-skip-verify.jsonnet new file mode 100644 index 00000000..603ba710 --- /dev/null +++ b/examples/etcd-skip-verify.jsonnet @@ -0,0 +1,22 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + { + _config+:: { + namespace: 'monitoring', + + etcd+:: { + ips: ['127.0.0.1'], + clientCA: importstr 
'etcd-client-ca.crt', + clientKey: importstr 'etcd-client.key', + clientCert: importstr 'etcd-client.crt', + insecureSkipVerify: true, + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet index 23883c2c..d63b8680 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet @@ -8,6 +8,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; clientKey: null, clientCert: null, serverName: null, + insecureSkipVerify: null, }, }, prometheus+:: { @@ -65,7 +66,8 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; caFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt', keyFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key', certFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.crt', - serverName: $._config.etcd.serverName, + [if $._config.etcd.serverName != null then 'serverName']: $._config.etcd.serverName, + [if $._config.etcd.insecureSkipVerify != null then 'insecureSkipVerify']: $._config.etcd.insecureSkipVerify, }, }, ], From 7bb87dd4321f444866084841ade27b24a2819118 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk 
Date: Tue, 7 Aug 2018 13:56:28 +0200 Subject: [PATCH 378/638] kube-prometheus: Bump versions --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5951577c..0001fe4b 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "793d90134afffc41c07d0482794379962f3e14ec" + "version": "18d040769c2128c292330915d60f267e0d4bb325" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "6c9a853f04f8e0cde6139f3a9d04d00517407b91" + "version": "f87b566248bb0713a56dc55bc545aa5aad17ace0" } ] } \ No newline at end of file From f291ce10053294d2198ecf7c8a59b3e0760d0ab0 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 14 Aug 2018 16:34:05 +0200 Subject: [PATCH 379/638] kube-prometheus: bump Prometheus Operator version --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 1ad3ceb3..78e9f61f 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.23.0" + "version": "v0.23.1" }, { "name": "etcd-mixin", From 52e1906ba08c01498cb3eb49b7408cf1187978cc Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 14 Aug 2018 16:35:59 +0200 Subject: [PATCH 380/638] kube-prometheus: Update dependencies and re-generate --- jsonnetfile.lock.json | 6 +++--- manifests/0prometheus-operator-deployment.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 0001fe4b..014191ef 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - 
"version": "18d040769c2128c292330915d60f267e0d4bb325" + "version": "cfb1c0ac6c458401cf47390a35d48b707612af0a" }, { "name": "ksonnet", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "82ac49106139eee53f1d76e062782f2e8449dd45" + "version": "ba92b2f232ef24684b9dc6bde03b74e1630909a6" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "f87b566248bb0713a56dc55bc545aa5aad17ace0" + "version": "3c89938adf2988a2a2f6bb031cb075ae38ae1d0e" } ] } \ No newline at end of file diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 020b54a2..052511f4 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -18,10 +18,10 @@ spec: containers: - args: - --kubelet-service=kube-system/kubelet - - -logtostderr=true + - --logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.23.0 - image: quay.io/coreos/prometheus-operator:v0.23.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.23.1 + image: quay.io/coreos/prometheus-operator:v0.23.1 name: prometheus-operator ports: - containerPort: 8080 From d482cef033b7c1f3afa2db70c8c01864f354093f Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Wed, 15 Aug 2018 17:27:24 +0200 Subject: [PATCH 381/638] Update Prometheus to v2.3.2 and Alertmanager to 0.15.2 --- jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet | 2 +- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- jsonnetfile.lock.json | 6 +++--- manifests/alertmanager-alertmanager.yaml | 2 +- manifests/prometheus-prometheus.yaml | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 250e7bd7..a9e9c037 100644 
--- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - alertmanager: 'v0.15.0', + alertmanager: 'v0.15.2', }, imageRepos+:: { diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 2f46190c..63df0506 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheus: 'v2.3.1', + prometheus: 'v2.3.2', }, imageRepos+:: { diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 014191ef..55291857 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "cfb1c0ac6c458401cf47390a35d48b707612af0a" + "version": "7590ad813c4661efbe13241611961cdb2f4dc5a6" }, { "name": "ksonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "a17504d81e2ab5b29d97c77faf72b8d7644bbabf" + "version": "8c4610783991b82ff12e485d24ac4f82d8839743" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "3c89938adf2988a2a2f6bb031cb075ae38ae1d0e" + "version": "2a6bc7d1130d2fa3336af5ffa2bcf82d760c945e" } ] } \ No newline at end of file diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index bdc115b9..e800beac 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -11,4 +11,4 @@ spec: beta.kubernetes.io/os: linux replicas: 3 serviceAccountName: alertmanager-main - version: v0.15.0 + version: v0.15.2 diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 
484c33b4..37d0e663 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -25,4 +25,4 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - version: v2.3.1 + version: v2.3.2 From cbb03d387c3ff2e3f9634a0c75649cdac4be2b9e Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Thu, 16 Aug 2018 11:07:36 +0200 Subject: [PATCH 382/638] *.sh: Set sane bash options on shell scripts Adding the following across the project: ```bash /# exit immediately when a command fails set -e /# only exit with zero if all commands of the pipeline exit successfully set -o pipefail /# error on unset variables +set -u ``` --- experimental/custom-metrics-api/gencerts.sh | 6 ++++++ hack/example-service-monitoring/deploy | 6 ++++++ hack/example-service-monitoring/teardown | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/experimental/custom-metrics-api/gencerts.sh b/experimental/custom-metrics-api/gencerts.sh index b1e16031..a8f5539d 100755 --- a/experimental/custom-metrics-api/gencerts.sh +++ b/experimental/custom-metrics-api/gencerts.sh @@ -1,4 +1,10 @@ #!/usr/bin/env bash +# exit immediately when a command fails +set -e +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail +# error on unset variables +set -u # Detect if we are on mac or should use GNU base64 options case $(uname) in diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy index 4912dd96..0c7cd7c1 100755 --- a/hack/example-service-monitoring/deploy +++ b/hack/example-service-monitoring/deploy @@ -1,3 +1,9 @@ #!/usr/bin/env bash +# exit immediately when a command fails +set -e +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail +# error on unset variables +set -u kubectl apply -f examples/example-app diff --git a/hack/example-service-monitoring/teardown b/hack/example-service-monitoring/teardown index 
62b546de..1a49f462 100755 --- a/hack/example-service-monitoring/teardown +++ b/hack/example-service-monitoring/teardown @@ -1,3 +1,9 @@ #!/usr/bin/env bash +# exit immediately when a command fails +set -e +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail +# error on unset variables +set -u kubectl delete -f examples/example-app From f80c5b96d3ffcdbf486bc67c9c3286be4c0ebb93 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Thu, 16 Aug 2018 11:39:16 +0200 Subject: [PATCH 383/638] kube-prometheus: Bump jsonnetfile.lock.json kube-prometheus reference --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 55291857..176dcf00 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "7590ad813c4661efbe13241611961cdb2f4dc5a6" + "version": "7a050d711ce4c4704bef5f139aacefb2b75f8cf3" }, { "name": "ksonnet", From 106ed842174ba538ac804af8a5779eefa734f998 Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Thu, 16 Aug 2018 14:45:34 +0200 Subject: [PATCH 384/638] kube-prometheus: bind mount host rootfs into node exporter Fixes https://github.com/prometheus/node_exporter#474 in the operator Fixes #569 (by making the mount explicit) Signed-off-by: Sergiusz Urbaniak --- .../node-exporter/node-exporter.libsonnet | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index c51347a3..8f647554 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -73,14 +73,26 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local sysVolume = volume.fromHostPath(sysVolumeName, '/sys'); local sysVolumeMount = 
containerVolumeMount.new(sysVolumeName, '/host/sys'); + local rootVolumeName = 'root'; + local rootVolume = volume.fromHostPath(rootVolumeName, '/root'); + local rootVolumeMount = containerVolumeMount.new(rootVolumeName, '/host/root'). + withMountPropagation('HostToContainer'). + withReadOnly(true); + local nodeExporter = container.new('node-exporter', $._config.imageRepos.nodeExporter + ':' + $._config.versions.nodeExporter) + container.withArgs([ '--web.listen-address=127.0.0.1:9101', '--path.procfs=/host/proc', '--path.sysfs=/host/sys', + + // The following settings have been taken from + // https://github.com/prometheus/node_exporter/blob/0662673/collector/filesystem_linux.go#L30-L31 + // Once node exporter is being released with those settings, this can be removed. + '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)', + '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$', ]) + - container.withVolumeMounts([procVolumeMount, sysVolumeMount]) + + container.withVolumeMounts([procVolumeMount, sysVolumeMount, rootVolumeMount]) + container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + container.mixin.resources.withLimits({ cpu: '102m', memory: '180Mi' }); @@ -105,7 +117,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; daemonset.mixin.spec.template.spec.withTolerations([masterToleration]) + daemonset.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + daemonset.mixin.spec.template.spec.withContainers(c) + - daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume]) + + daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume, rootVolume]) + daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + 
daemonset.mixin.spec.template.spec.withServiceAccountName('node-exporter') + From 893108e49e355f93fb6f21be1a6b68e46d0de3ac Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Fri, 17 Aug 2018 12:33:28 +0200 Subject: [PATCH 385/638] kube-prometheus: update dependencies Signed-off-by: Sergiusz Urbaniak --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 176dcf00..a63fba30 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "7a050d711ce4c4704bef5f139aacefb2b75f8cf3" + "version": "383cee46d2eab2dc477c8f4b1526b4f93f993280" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "2a6bc7d1130d2fa3336af5ffa2bcf82d760c945e" + "version": "6890a9e633b0cdccdeaf65ccda3d84fb0838801f" } ] } \ No newline at end of file From 155378342f1b34f8a6120ca211c69a8b02dfb16c Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Fri, 17 Aug 2018 12:35:00 +0200 Subject: [PATCH 386/638] kube-prometheus: generate manifests Signed-off-by: Sergiusz Urbaniak --- manifests/node-exporter-daemonset.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index f7c9ebb5..e63ae15c 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -19,6 +19,8 @@ spec: - --web.listen-address=127.0.0.1:9101 - --path.procfs=/host/proc - --path.sysfs=/host/sys + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ image: quay.io/prometheus/node-exporter:v0.15.2 name: node-exporter resources: @@ -35,6 +37,10 @@ spec: - 
mountPath: /host/sys name: sys readOnly: false + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true - args: - --secure-listen-address=:9100 - --upstream=http://127.0.0.1:9101/ @@ -69,3 +75,6 @@ spec: - hostPath: path: /sys name: sys + - hostPath: + path: /root + name: root From da2536398cf5f7b9a6e6e149b8d6f7ab0e4e747a Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 20 Aug 2018 21:53:37 +0200 Subject: [PATCH 387/638] kube-prometheus: Update node-exporter to v0.16.0 --- .../node-exporter/node-exporter.libsonnet | 2 +- ...rter-v0.16.0-compatibility-rules.libsonnet | 406 ++++++++++++++++++ .../rules/node-rules.libsonnet | 39 ++ jsonnet/kube-prometheus/rules/rules.libsonnet | 41 +- 4 files changed, 448 insertions(+), 40 deletions(-) create mode 100644 jsonnet/kube-prometheus/rules/node-exporter-v0.16.0-compatibility-rules.libsonnet create mode 100644 jsonnet/kube-prometheus/rules/node-rules.libsonnet diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 8f647554..5791fd3b 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - nodeExporter: 'v0.15.2', + nodeExporter: 'v0.16.0', kubeRbacProxy: 'v0.3.1', }, diff --git a/jsonnet/kube-prometheus/rules/node-exporter-v0.16.0-compatibility-rules.libsonnet b/jsonnet/kube-prometheus/rules/node-exporter-v0.16.0-compatibility-rules.libsonnet new file mode 100644 index 00000000..f8e5d6d5 --- /dev/null +++ b/jsonnet/kube-prometheus/rules/node-exporter-v0.16.0-compatibility-rules.libsonnet @@ -0,0 +1,406 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'node_exporter-16-bcache', + rules: [ + { + expr: 'node_bcache_cache_read_races', + record: 'node_bcache_cache_read_races_total', + }, + ], + }, + 
{ + name: 'node_exporter-16-buddyinfo', + rules: [ + { + expr: 'node_buddyinfo_blocks', + record: 'node_buddyinfo_count', + }, + ], + }, + { + name: 'node_exporter-16-stat', + rules: [ + { + expr: 'node_boot_time_seconds', + record: 'node_boot_time', + }, + { + expr: 'node_context_switches_total', + record: 'node_context_switches', + }, + { + expr: 'node_forks_total', + record: 'node_forks', + }, + { + expr: 'node_intr_total', + record: 'node_intr', + }, + ], + }, + { + name: 'node_exporter-16-cpu', + rules: [ + { + expr: 'label_replace(node_cpu_seconds_total, "cpu", "$1", "cpu", "cpu(.+)")', + record: 'node_cpu', + }, + ], + }, + { + name: 'node_exporter-16-diskstats', + rules: [ + { + expr: 'node_disk_read_bytes_total', + record: 'node_disk_bytes_read', + }, + { + expr: 'node_disk_written_bytes_total', + record: 'node_disk_bytes_written', + }, + { + expr: 'node_disk_io_time_seconds_total * 1000', + record: 'node_disk_io_time_ms', + }, + { + expr: 'node_disk_io_time_weighted_seconds_total', + record: 'node_disk_io_time_weighted', + }, + { + expr: 'node_disk_reads_completed_total', + record: 'node_disk_reads_completed', + }, + { + expr: 'node_disk_reads_merged_total', + record: 'node_disk_reads_merged', + }, + { + expr: 'node_disk_read_time_seconds_total * 1000', + record: 'node_disk_read_time_ms', + }, + { + expr: 'node_disk_writes_completed_total', + record: 'node_disk_writes_completed', + }, + { + expr: 'node_disk_writes_merged_total', + record: 'node_disk_writes_merged', + }, + { + expr: 'node_disk_write_time_seconds_total * 1000', + record: 'node_disk_write_time_ms', + }, + ], + }, + { + name: 'node_exporter-16-filesystem', + rules: [ + { + expr: 'node_filesystem_free_bytes', + record: 'node_filesystem_free', + }, + { + expr: 'node_filesystem_avail_bytes', + record: 'node_filesystem_avail', + }, + { + expr: 'node_filesystem_size_bytes', + record: 'node_filesystem_size', + }, + ], + }, + { + name: 'node_exporter-16-infiniband', + rules: [ + { + expr: 
'node_infiniband_port_data_received_bytes_total', + record: 'node_infiniband_port_data_received_bytes', + }, + { + expr: 'node_infiniband_port_data_transmitted_bytes_total', + record: 'node_infiniband_port_data_transmitted_bytes', + }, + ], + }, + { + name: 'node_exporter-16-interrupts', + rules: [ + { + expr: 'node_interrupts_total', + record: 'node_interrupts', + }, + ], + }, + { + name: 'node_exporter-16-memory', + rules: [ + { + expr: 'node_memory_Active_bytes', + record: 'node_memory_Active', + }, + { + expr: 'node_memory_Active_anon_bytes', + record: 'node_memory_Active_anon', + }, + { + expr: 'node_memory_Active_file_bytes', + record: 'node_memory_Active_file', + }, + { + expr: 'node_memory_AnonHugePages_bytes', + record: 'node_memory_AnonHugePages', + }, + { + expr: 'node_memory_AnonPages_bytes', + record: 'node_memory_AnonPages', + }, + { + expr: 'node_memory_Bounce_bytes', + record: 'node_memory_Bounce', + }, + { + expr: 'node_memory_Buffers_bytes', + record: 'node_memory_Buffers', + }, + { + expr: 'node_memory_Cached_bytes', + record: 'node_memory_Cached', + }, + { + expr: 'node_memory_CommitLimit_bytes', + record: 'node_memory_CommitLimit', + }, + { + expr: 'node_memory_Committed_AS_bytes', + record: 'node_memory_Committed_AS', + }, + { + expr: 'node_memory_DirectMap2M_bytes', + record: 'node_memory_DirectMap2M', + }, + { + expr: 'node_memory_DirectMap4k_bytes', + record: 'node_memory_DirectMap4k', + }, + { + expr: 'node_memory_Dirty_bytes', + record: 'node_memory_Dirty', + }, + { + expr: 'node_memory_HardwareCorrupted_bytes', + record: 'node_memory_HardwareCorrupted', + }, + { + expr: 'node_memory_Hugepagesize_bytes', + record: 'node_memory_Hugepagesize', + }, + { + expr: 'node_memory_Inactive_bytes', + record: 'node_memory_Inactive', + }, + { + expr: 'node_memory_Inactive_anon_bytes', + record: 'node_memory_Inactive_anon', + }, + { + expr: 'node_memory_Inactive_file_bytes', + record: 'node_memory_Inactive_file', + }, + { + expr: 
'node_memory_KernelStack_bytes', + record: 'node_memory_KernelStack', + }, + { + expr: 'node_memory_Mapped_bytes', + record: 'node_memory_Mapped', + }, + { + expr: 'node_memory_MemAvailable_bytes', + record: 'node_memory_MemAvailable', + }, + { + expr: 'node_memory_MemFree_bytes', + record: 'node_memory_MemFree', + }, + { + expr: 'node_memory_MemTotal_bytes', + record: 'node_memory_MemTotal', + }, + { + expr: 'node_memory_Mlocked_bytes', + record: 'node_memory_Mlocked', + }, + { + expr: 'node_memory_NFS_Unstable_bytes', + record: 'node_memory_NFS_Unstable', + }, + { + expr: 'node_memory_PageTables_bytes', + record: 'node_memory_PageTables', + }, + { + expr: 'node_memory_Shmem_bytes', + record: 'node_memory_Shmem', + }, + { + expr: 'node_memory_Slab_bytes', + record: 'node_memory_Slab', + }, + { + expr: 'node_memory_SReclaimable_bytes', + record: 'node_memory_SReclaimable', + }, + { + expr: 'node_memory_SUnreclaim_bytes', + record: 'node_memory_SUnreclaim', + }, + { + expr: 'node_memory_SwapCached_bytes', + record: 'node_memory_SwapCached', + }, + { + expr: 'node_memory_SwapFree_bytes', + record: 'node_memory_SwapFree', + }, + { + expr: 'node_memory_SwapTotal_bytes', + record: 'node_memory_SwapTotal', + }, + { + expr: 'node_memory_Unevictable_bytes', + record: 'node_memory_Unevictable', + }, + { + expr: 'node_memory_VmallocChunk_bytes', + record: 'node_memory_VmallocChunk', + }, + { + expr: 'node_memory_VmallocTotal_bytes', + record: 'node_memory_VmallocTotal', + }, + { + expr: 'node_memory_VmallocUsed_bytes', + record: 'node_memory_VmallocUsed', + }, + { + expr: 'node_memory_Writeback_bytes', + record: 'node_memory_Writeback', + }, + { + expr: 'node_memory_WritebackTmp_bytes', + record: 'node_memory_WritebackTmp', + }, + ], + }, + { + name: 'node_exporter-16-network', + rules: [ + { + expr: 'node_network_receive_bytes_total', + record: 'node_network_receive_bytes', + }, + { + expr: 'node_network_receive_compressed_total', + record: 
'node_network_receive_compressed', + }, + { + expr: 'node_network_receive_drop_total', + record: 'node_network_receive_drop', + }, + { + expr: 'node_network_receive_errs_total', + record: 'node_network_receive_errs', + }, + { + expr: 'node_network_receive_fifo_total', + record: 'node_network_receive_fifo', + }, + { + expr: 'node_network_receive_frame_total', + record: 'node_network_receive_frame', + }, + { + expr: 'node_network_receive_multicast_total', + record: 'node_network_receive_multicast', + }, + { + expr: 'node_network_receive_packets_total', + record: 'node_network_receive_packets', + }, + { + expr: 'node_network_transmit_bytes_total', + record: 'node_network_transmit_bytes', + }, + { + expr: 'node_network_transmit_compressed_total', + record: 'node_network_transmit_compressed', + }, + { + expr: 'node_network_transmit_drop_total', + record: 'node_network_transmit_drop', + }, + { + expr: 'node_network_transmit_errs_total', + record: 'node_network_transmit_errs', + }, + { + expr: 'node_network_transmit_fifo_total', + record: 'node_network_transmit_fifo', + }, + { + expr: 'node_network_transmit_frame_total', + record: 'node_network_transmit_frame', + }, + { + expr: 'node_network_transmit_multicast_total', + record: 'node_network_transmit_multicast', + }, + { + expr: 'node_network_transmit_packets_total', + record: 'node_network_transmit_packets', + }, + ], + }, + { + name: 'node_exporter-16-nfs', + rules: [ + { + expr: 'node_nfs_connections_total', + record: 'node_nfs_net_connections', + }, + { + expr: 'node_nfs_packets_total', + record: 'node_nfs_net_reads', + }, + { + expr: 'label_replace(label_replace(node_nfs_requests_total, "proto", "$1", "version", "(.+)"), "method", "$1", "procedure", "(.+)")', + record: 'node_nfs_procedures', + }, + { + expr: 'node_nfs_rpc_authentication_refreshes_total', + record: 'node_nfs_rpc_authentication_refreshes', + }, + { + expr: 'node_nfs_rpcs_total', + record: 'node_nfs_rpc_operations', + }, + { + expr: 
'node_nfs_rpc_retransmissions_total', + record: 'node_nfs_rpc_retransmissions', + }, + ], + }, + { + name: 'node_exporter-16-textfile', + rules: [ + { + expr: 'node_textfile_mtime_seconds', + record: 'node_textfile_mtime', + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/rules/node-rules.libsonnet b/jsonnet/kube-prometheus/rules/node-rules.libsonnet new file mode 100644 index 00000000..ec3a331e --- /dev/null +++ b/jsonnet/kube-prometheus/rules/node-rules.libsonnet @@ -0,0 +1,39 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'kube-prometheus-node-recording.rules', + rules: [ + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)', + record: 'instance:node_cpu:rate:sum', + }, + { + expr: 'sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)', + record: 'instance:node_filesystem_usage:sum', + }, + { + expr: 'sum(rate(node_network_receive_bytes[3m])) BY (instance)', + record: 'instance:node_network_receive_bytes:rate:sum', + }, + { + expr: 'sum(rate(node_network_transmit_bytes[3m])) BY (instance)', + record: 'instance:node_network_transmit_bytes:rate:sum', + }, + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)', + record: 'instance:node_cpu:ratio', + }, + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))', + record: 'cluster:node_cpu:sum_rate5m', + }, + { + expr: 'cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))', + record: 'cluster:node_cpu:ratio', + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/rules/rules.libsonnet b/jsonnet/kube-prometheus/rules/rules.libsonnet index ec3a331e..6654e16b 100644 --- a/jsonnet/kube-prometheus/rules/rules.libsonnet +++ b/jsonnet/kube-prometheus/rules/rules.libsonnet @@ -1,39 +1,2 @@ -{ - prometheusRules+:: { - groups+: [ - { - name: 'kube-prometheus-node-recording.rules', - rules: [ 
- { - expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)', - record: 'instance:node_cpu:rate:sum', - }, - { - expr: 'sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)', - record: 'instance:node_filesystem_usage:sum', - }, - { - expr: 'sum(rate(node_network_receive_bytes[3m])) BY (instance)', - record: 'instance:node_network_receive_bytes:rate:sum', - }, - { - expr: 'sum(rate(node_network_transmit_bytes[3m])) BY (instance)', - record: 'instance:node_network_transmit_bytes:rate:sum', - }, - { - expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)', - record: 'instance:node_cpu:ratio', - }, - { - expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))', - record: 'cluster:node_cpu:sum_rate5m', - }, - { - expr: 'cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))', - record: 'cluster:node_cpu:ratio', - }, - ], - }, - ], - }, -} +(import 'node-rules.libsonnet') + +(import 'node-exporter-v0.16.0-compatibility-rules.libsonnet') From 13b747e1ff1256750a7404cfecece61d5a6994ef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 21 Aug 2018 16:49:39 +0200 Subject: [PATCH 388/638] kube-prometheus: Upgrade jsonnet dependencies --- jsonnetfile.lock.json | 6 +- manifests/node-exporter-daemonset.yaml | 2 +- manifests/prometheus-rules.yaml | 195 +++++++++++++++++++++++++ 3 files changed, 199 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index a63fba30..431f5a2c 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "383cee46d2eab2dc477c8f4b1526b4f93f993280" + "version": "c9631844b82cdce4eac7e05e21543018ea39acfc" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "3c341913ddd3882c8f1edc1c20accdbcaaf10525" + "version": 
"b46003c21f1eba3b7d9d361b44e77305ccff601e" }, { "name": "grafonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "6890a9e633b0cdccdeaf65ccda3d84fb0838801f" + "version": "2921ab670fa646c2cbb3ba53b5c5bc11ea47632f" } ] } \ No newline at end of file diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index e63ae15c..a2669187 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -21,7 +21,7 @@ spec: - --path.sysfs=/host/sys - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ - image: quay.io/prometheus/node-exporter:v0.15.2 + image: quay.io/prometheus/node-exporter:v0.16.0 name: node-exporter resources: limits: diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 695bcb0d..bf07b9a4 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -263,6 +263,201 @@ spec: record: cluster:node_cpu:sum_rate5m - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) record: cluster:node_cpu:ratio + - name: node_exporter-16-bcache + rules: + - expr: node_bcache_cache_read_races + record: node_bcache_cache_read_races_total + - name: node_exporter-16-buddyinfo + rules: + - expr: node_buddyinfo_blocks + record: node_buddyinfo_count + - name: node_exporter-16-stat + rules: + - expr: node_boot_time_seconds + record: node_boot_time + - expr: node_context_switches_total + record: node_context_switches + - expr: node_forks_total + record: node_forks + - expr: node_intr_total + record: node_intr + - name: node_exporter-16-cpu + rules: + - expr: label_replace(node_cpu_seconds_total, "cpu", "$1", "cpu", "cpu(.+)") + record: node_cpu + - name: node_exporter-16-diskstats + rules: + - 
expr: node_disk_read_bytes_total + record: node_disk_bytes_read + - expr: node_disk_written_bytes_total + record: node_disk_bytes_written + - expr: node_disk_io_time_seconds_total * 1000 + record: node_disk_io_time_ms + - expr: node_disk_io_time_weighted_seconds_total + record: node_disk_io_time_weighted + - expr: node_disk_reads_completed_total + record: node_disk_reads_completed + - expr: node_disk_reads_merged_total + record: node_disk_reads_merged + - expr: node_disk_read_time_seconds_total * 1000 + record: node_disk_read_time_ms + - expr: node_disk_writes_completed_total + record: node_disk_writes_completed + - expr: node_disk_writes_merged_total + record: node_disk_writes_merged + - expr: node_disk_write_time_seconds_total * 1000 + record: node_disk_write_time_ms + - name: node_exporter-16-filesystem + rules: + - expr: node_filesystem_free_bytes + record: node_filesystem_free + - expr: node_filesystem_avail_bytes + record: node_filesystem_avail + - expr: node_filesystem_size_bytes + record: node_filesystem_size + - name: node_exporter-16-infiniband + rules: + - expr: node_infiniband_port_data_received_bytes_total + record: node_infiniband_port_data_received_bytes + - expr: node_infiniband_port_data_transmitted_bytes_total + record: node_infiniband_port_data_transmitted_bytes + - name: node_exporter-16-interrupts + rules: + - expr: node_interrupts_total + record: node_interrupts + - name: node_exporter-16-memory + rules: + - expr: node_memory_Active_bytes + record: node_memory_Active + - expr: node_memory_Active_anon_bytes + record: node_memory_Active_anon + - expr: node_memory_Active_file_bytes + record: node_memory_Active_file + - expr: node_memory_AnonHugePages_bytes + record: node_memory_AnonHugePages + - expr: node_memory_AnonPages_bytes + record: node_memory_AnonPages + - expr: node_memory_Bounce_bytes + record: node_memory_Bounce + - expr: node_memory_Buffers_bytes + record: node_memory_Buffers + - expr: node_memory_Cached_bytes + record: 
node_memory_Cached + - expr: node_memory_CommitLimit_bytes + record: node_memory_CommitLimit + - expr: node_memory_Committed_AS_bytes + record: node_memory_Committed_AS + - expr: node_memory_DirectMap2M_bytes + record: node_memory_DirectMap2M + - expr: node_memory_DirectMap4k_bytes + record: node_memory_DirectMap4k + - expr: node_memory_Dirty_bytes + record: node_memory_Dirty + - expr: node_memory_HardwareCorrupted_bytes + record: node_memory_HardwareCorrupted + - expr: node_memory_Hugepagesize_bytes + record: node_memory_Hugepagesize + - expr: node_memory_Inactive_bytes + record: node_memory_Inactive + - expr: node_memory_Inactive_anon_bytes + record: node_memory_Inactive_anon + - expr: node_memory_Inactive_file_bytes + record: node_memory_Inactive_file + - expr: node_memory_KernelStack_bytes + record: node_memory_KernelStack + - expr: node_memory_Mapped_bytes + record: node_memory_Mapped + - expr: node_memory_MemAvailable_bytes + record: node_memory_MemAvailable + - expr: node_memory_MemFree_bytes + record: node_memory_MemFree + - expr: node_memory_MemTotal_bytes + record: node_memory_MemTotal + - expr: node_memory_Mlocked_bytes + record: node_memory_Mlocked + - expr: node_memory_NFS_Unstable_bytes + record: node_memory_NFS_Unstable + - expr: node_memory_PageTables_bytes + record: node_memory_PageTables + - expr: node_memory_Shmem_bytes + record: node_memory_Shmem + - expr: node_memory_Slab_bytes + record: node_memory_Slab + - expr: node_memory_SReclaimable_bytes + record: node_memory_SReclaimable + - expr: node_memory_SUnreclaim_bytes + record: node_memory_SUnreclaim + - expr: node_memory_SwapCached_bytes + record: node_memory_SwapCached + - expr: node_memory_SwapFree_bytes + record: node_memory_SwapFree + - expr: node_memory_SwapTotal_bytes + record: node_memory_SwapTotal + - expr: node_memory_Unevictable_bytes + record: node_memory_Unevictable + - expr: node_memory_VmallocChunk_bytes + record: node_memory_VmallocChunk + - expr: node_memory_VmallocTotal_bytes + 
record: node_memory_VmallocTotal + - expr: node_memory_VmallocUsed_bytes + record: node_memory_VmallocUsed + - expr: node_memory_Writeback_bytes + record: node_memory_Writeback + - expr: node_memory_WritebackTmp_bytes + record: node_memory_WritebackTmp + - name: node_exporter-16-network + rules: + - expr: node_network_receive_bytes_total + record: node_network_receive_bytes + - expr: node_network_receive_compressed_total + record: node_network_receive_compressed + - expr: node_network_receive_drop_total + record: node_network_receive_drop + - expr: node_network_receive_errs_total + record: node_network_receive_errs + - expr: node_network_receive_fifo_total + record: node_network_receive_fifo + - expr: node_network_receive_frame_total + record: node_network_receive_frame + - expr: node_network_receive_multicast_total + record: node_network_receive_multicast + - expr: node_network_receive_packets_total + record: node_network_receive_packets + - expr: node_network_transmit_bytes_total + record: node_network_transmit_bytes + - expr: node_network_transmit_compressed_total + record: node_network_transmit_compressed + - expr: node_network_transmit_drop_total + record: node_network_transmit_drop + - expr: node_network_transmit_errs_total + record: node_network_transmit_errs + - expr: node_network_transmit_fifo_total + record: node_network_transmit_fifo + - expr: node_network_transmit_frame_total + record: node_network_transmit_frame + - expr: node_network_transmit_multicast_total + record: node_network_transmit_multicast + - expr: node_network_transmit_packets_total + record: node_network_transmit_packets + - name: node_exporter-16-nfs + rules: + - expr: node_nfs_connections_total + record: node_nfs_net_connections + - expr: node_nfs_packets_total + record: node_nfs_net_reads + - expr: label_replace(label_replace(node_nfs_requests_total, "proto", "$1", "version", + "(.+)"), "method", "$1", "procedure", "(.+)") + record: node_nfs_procedures + - expr: 
node_nfs_rpc_authentication_refreshes_total + record: node_nfs_rpc_authentication_refreshes + - expr: node_nfs_rpcs_total + record: node_nfs_rpc_operations + - expr: node_nfs_rpc_retransmissions_total + record: node_nfs_rpc_retransmissions + - name: node_exporter-16-textfile + rules: + - expr: node_textfile_mtime_seconds + record: node_textfile_mtime - name: kubernetes-absent rules: - alert: AlertmanagerDown From 4099a04cc0a90dd77e29da8d837cd22d48b01fea Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Wed, 22 Aug 2018 09:09:01 -0500 Subject: [PATCH 389/638] minor whitespace change Trying to get travis-ci build to pass for "git diff --exit-code" command --- build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sh b/build.sh index 8e9d4342..f68cd447 100755 --- a/build.sh +++ b/build.sh @@ -13,3 +13,4 @@ mkdir manifests # optional, but we would like to generate yaml, not json jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml; rm -f {}' -- {} + From 2a9a31875ae0a8f9b4c432e53e5e61af2f35a24a Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Thu, 23 Aug 2018 11:47:08 +0200 Subject: [PATCH 390/638] kube-prometheus: fix root typo in node-exporter Currently, we are mounting `/root` from the host as `/host/root`. Instead simply `/` from the host should be mounted. 
This fixes it Signed-off-by: Sergiusz Urbaniak --- jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 5791fd3b..7d8aadd8 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -74,7 +74,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys'); local rootVolumeName = 'root'; - local rootVolume = volume.fromHostPath(rootVolumeName, '/root'); + local rootVolume = volume.fromHostPath(rootVolumeName, '/'); local rootVolumeMount = containerVolumeMount.new(rootVolumeName, '/host/root'). withMountPropagation('HostToContainer'). withReadOnly(true); From 8a45ecfbcbeceb9fbe7b8688345073e7fd1549a6 Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Thu, 23 Aug 2018 12:01:02 +0200 Subject: [PATCH 391/638] kube-prometheus: update dependencies of local changes Signed-off-by: Sergiusz Urbaniak --- jsonnetfile.lock.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 431f5a2c..8f790443 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "c9631844b82cdce4eac7e05e21543018ea39acfc" + "version": "43d93161ee1d5215a25e6a35582605cf2f4c26dd" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "b46003c21f1eba3b7d9d361b44e77305ccff601e" + "version": "a2b417a9aa27d508d1c0b730ace0dff20b3b7bf7" }, { "name": "grafonnet", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "942cd2349e27c2510c71158ee9ca953df33724f2" + "version": "9a20f81c9007e4c7409dd0b3edda1a7a78ad2c63" }, { "name": "prometheus-operator", @@ -78,7 
+78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "2921ab670fa646c2cbb3ba53b5c5bc11ea47632f" + "version": "bdc333359b618b7519660ead7a4ef30288f8d350" } ] } \ No newline at end of file From af5fb9ee09ecbc33a69b6aae19d85f710d571822 Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Thu, 23 Aug 2018 13:38:13 +0200 Subject: [PATCH 392/638] kubernetes-prometheus: regenerate Signed-off-by: Sergiusz Urbaniak --- manifests/grafana-dashboardDefinitions.yaml | 32 ++++++++++----------- manifests/node-exporter-daemonset.yaml | 2 +- manifests/prometheus-rules.yaml | 21 ++++++++++---- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index af2b2b0a..db2d0939 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1920,7 +1920,7 @@ items: "steppedLine": false, "targets": [ { - "expr": ":node_cpu_utilisation:avg1m", + "expr": "1 - avg(rate(node_cpu{mode=\"idle\"}[1m]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2172,7 +2172,7 @@ items: "steppedLine": false, "targets": [ { - "expr": ":node_memory_utilisation:", + "expr": "1 - sum(:node_memory_MemFreeCachedBuffers:sum) / sum(:node_memory_MemTotal:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2256,7 +2256,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(node_memory_MemTotal)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2340,7 +2340,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(node_memory_MemTotal)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal:sum)", "format": "time_series", "instant": true, 
"intervalFactor": 2, @@ -2435,7 +2435,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total[1m])) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2664,7 +2664,7 @@ items: ], "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2682,7 +2682,7 @@ items: "step": 10 }, { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2700,7 +2700,7 @@ items: "step": 10 }, { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3247,7 +3247,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[1m])) by (pod_name)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}) by (pod_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod_name}}", @@ -3476,7 +3476,7 @@ items: ], "targets": [ { - "expr": 
"sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3494,7 +3494,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3512,7 +3512,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4086,7 +4086,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", 
container_name!=\"POD\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4315,7 +4315,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4333,7 +4333,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4351,7 +4351,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / 
sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index a2669187..b3febf8c 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -76,5 +76,5 @@ spec: path: /sys name: sys - hostPath: - path: /root + path: / name: root diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index bf07b9a4..121b974f 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -13,6 +13,11 @@ spec: - expr: | sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace) record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, pod_name, container_name) ( + rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m]) + ) + record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate - expr: | sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace) record: namespace:container_memory_usage_bytes:sum @@ -151,6 +156,12 @@ spec: / sum(node_memory_MemTotal{job="node-exporter"}) record: ':node_memory_utilisation:' + - expr: | + sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + record: :node_memory_MemFreeCachedBuffers:sum + - expr: | + sum(node_memory_MemTotal{job="node-exporter"}) + record: :node_memory_MemTotal:sum - expr: | sum by (node) ( (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) @@ -796,10 +807,10 @@ spec: }}' is experiencing {{ printf "%0.0f" $value }}% errors.' 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | - sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100 + (sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job) / - sum(rate(rest_client_requests_total[5m])) by (instance, job) - > 1 + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + * 100 > 1 for: 15m labels: severity: warning @@ -829,7 +840,7 @@ spec: for {{$labels.verb}} {{$labels.resource}}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | - cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m labels: severity: warning @@ -839,7 +850,7 @@ spec: for {{$labels.verb}} {{$labels.resource}}. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | - cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m labels: severity: critical From 4c90bd93b01e5762ec7a8efa1b8312ac309015c5 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 27 Aug 2018 13:58:06 -0600 Subject: [PATCH 393/638] kube-prometheus: Update Prometheus Operator dependency --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 78e9f61f..9ba63cfc 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.23.1" + "version": "v0.23.2" }, { "name": "etcd-mixin", From b213437fb6976ad07a23218f64398babcda04446 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 27 Aug 2018 14:07:15 -0600 Subject: [PATCH 394/638] kube-prometheus: Update jsonnet dependencies --- jsonnetfile.lock.json | 10 +- .../0prometheus-operator-deployment.yaml | 4 +- manifests/grafana-dashboardDefinitions.yaml | 155 +++++++++--------- 3 files changed, 83 insertions(+), 86 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 8f790443..5f6e091a 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "43d93161ee1d5215a25e6a35582605cf2f4c26dd" + "version": "66226d27f1fca6096c420b46b097b9e2475189e6" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": 
"a2b417a9aa27d508d1c0b730ace0dff20b3b7bf7" + "version": "4090e091fee875fd2baec1531a7ef9c2ab58c99b" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "0fdef020a6360415d2c8fdc82b29122583e4df05" + "version": "d21784739a9adc7992c0382d1efa42be4ddb3044" }, { "name": "grafana-builder", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "ba92b2f232ef24684b9dc6bde03b74e1630909a6" + "version": "27b1eb72d9d93e5ab447121e2b884bc558bde01d" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "bdc333359b618b7519660ead7a4ef30288f8d350" + "version": "af85949b416547ed0989b396a28fe77f65978828" } ] } \ No newline at end of file diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 052511f4..6894d92c 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -20,8 +20,8 @@ spec: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.23.1 - image: quay.io/coreos/prometheus-operator:v0.23.1 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.23.2 + image: quay.io/coreos/prometheus-operator:v0.23.2 name: prometheus-operator ports: - containerPort: 8080 diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index db2d0939..551647b8 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"} - node_filesystem_avail{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace)) by (pod,namespace) / scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[24]\"}) by (device,pod,namespace))) 
* on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -1712,10 +1712,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - sum(max by (device, node) (node_filesystem_avail{fstype=\u007e\"ext[24]\"})) / sum(max by (device, node) (node_filesystem_size{fstype=\u007e\"ext[24]\"}))", + "expr": "max ((node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"})\n/ node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (namespace, pod, device)\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{node=\"$node\"}\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Disk", + "legendFormat": "{{device}}", "legendLink": null, "step": 10 } @@ -4914,7 +4914,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -5124,7 +5123,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -5165,7 +5163,7 @@ items: ], "spaceLength": 10, - "span": 6, + "span": 9, "stack": false, "steppedLine": false, "targets": [ @@ -5309,7 +5307,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -5515,7 +5512,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -5563,7 +5559,7 @@ items: } ], "spaceLength": 10, - "span": 9, + "span": 6, "stack": false, "steppedLine": false, "targets": [ @@ 
-5630,84 +5626,95 @@ items: ] }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, "gridPos": { }, "id": 9, - "interval": null, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "(\n sum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n- sum(node_filesystem_avail{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n) * 100\n /\nsum(node_filesystem_size{job=\"node-exporter\", device!=\"rootfs\", instance=\"$instance\"})\n", + "expr": "max 
((node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"}\n- node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"})\n/ node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"}) by (namespace, pod, device)\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "{{device}}", + "refId": "A" } ], - "thresholds": "80, 90", + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, "title": "Disk Space Usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ { - "op": "=", - "text": "N/A", - "value": "null" + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "valueName": "current" + ] } ], "repeat": null, @@ -5721,7 +5728,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -6026,7 +6032,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -6067,7 +6072,6 @@ items: ], "spaceLength": 10, - "span": 12, "stack": false, "steppedLine": false, "targets": [ @@ -6145,7 +6149,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -6186,7 +6189,6 @@ items: ], "spaceLength": 10, - "span": 12, "stack": false, "steppedLine": false, "targets": [ @@ -6250,7 +6252,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -6291,7 +6292,6 @@ items: ], "spaceLength": 10, - "span": 12, "stack": false, "steppedLine": false, "targets": [ @@ -6516,7 
+6516,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "cacheTimeout": null, @@ -7101,7 +7100,6 @@ items: { "collapse": false, "collapsed": false, - "height": "250px", "panels": [ { "aliasColors": { @@ -7142,7 +7140,6 @@ items: ], "spaceLength": 10, - "span": 12, "stack": false, "steppedLine": false, "targets": [ From ae61963157669c44e38ac74d92af70f2e161a53d Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Thu, 30 Aug 2018 11:39:30 -0500 Subject: [PATCH 395/638] kube-prometheus: add e.g. of using persistent storage for the prom tsdb --- examples/prometheus-pvc.jsonnet | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 examples/prometheus-pvc.jsonnet diff --git a/examples/prometheus-pvc.jsonnet b/examples/prometheus-pvc.jsonnet new file mode 100644 index 00000000..75b250fe --- /dev/null +++ b/examples/prometheus-pvc.jsonnet @@ -0,0 +1,59 @@ + +// Reference info: documentation for https://github.com/ksonnet/ksonnet-lib can be found at http://g.bryan.dev.hepti.center +// +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; // https://github.com/ksonnet/ksonnet-lib/blob/master/ksonnet.beta.3/k.libsonnet - imports k8s.libsonnet +// * https://github.com/ksonnet/ksonnet-lib/blob/master/ksonnet.beta.3/k8s.libsonnet defines things such as "persistentVolumeClaim:: {" +// +local pvc = k.core.v1.persistentVolumeClaim; // https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#persistentvolumeclaim-v1-core (defines variable named 'spec' of type 'PersistentVolumeClaimSpec') + +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-bootkube.libsonnet') + + { + _config+:: { + namespace: 'monitoring', + }, + + prometheus+:: { + prometheus+: { + spec+: { // https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec + // If a value isn't specified for 'retention', then by default the 
'--storage.tsdb.retention=24h' arg will be passed to prometheus by prometheus-operator. + // The possible values for a prometheus are: + // * https://github.com/prometheus/common/blob/c7de230/model/time.go#L178 specifies "^([0-9]+)(y|w|d|h|m|s|ms)$" (years weeks days hours minutes seconds milliseconds) + retention: "30d", + + // Reference info: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md + // By default (if the following 'storage.volumeClaimTemplate' isn't created), prometheus will be created with an EmptyDir for the 'prometheus-k8s-db' volume (for the prom tsdb). + // This 'storage.volumeClaimTemplate' causes the following to be automatically created (via dynamic provisioning) for each prometheus pod: + // * PersistentVolumeClaim (and a corresponding PersistentVolume) + // * the actual volume (per the StorageClassName specified below) + storage: { // https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#storagespec + volumeClaimTemplate: // (same link as above where the 'pvc' variable is defined) + pvc.new() + // http://g.bryan.dev.hepti.center/core/v1/persistentVolumeClaim/#core.v1.persistentVolumeClaim.new + + pvc.mixin.spec.withAccessModes('ReadWriteOnce') + + + // https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#resourcerequirements-v1-core (defines 'requests'), + // and https://kubernetes.io/docs/concepts/policy/resource-quotas/#storage-resource-quota (defines 'requests.storage') + pvc.mixin.spec.resources.withRequests({ storage: '100Gi' }) + + + // A StorageClass of the following name (which can be seen via `kubectl get storageclass` from a node in the given K8s cluster) must exist prior to kube-prometheus being deployed. 
+ pvc.mixin.spec.withStorageClassName('ssd'), + + // The following 'selector' is only needed if you're using manual storage provisioning (https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md#manual-storage-provisioning). + // And note that this is not supported/allowed by AWS - uncommenting the following 'selector' line (when deploying kube-prometheus to a K8s cluster in AWS) will cause the pvc to be stuck in the Pending status and have the following error: + // * 'Failed to provision volume with StorageClass "ssd": claim.Spec.Selector is not supported for dynamic provisioning on AWS' + //pvc.mixin.spec.selector.withMatchLabels({}), + }, // storage + }, // spec + }, // prometheus + }, // prometheus + + }; + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } From 35c90355ba0ab600531806255192ecfa25d2fb24 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Thu, 30 Aug 2018 11:43:17 -0500 Subject: [PATCH 396/638] kube-prometheus: add link to grafana config --- README.md | 2 +- examples/minikube.jsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f05b92a8..128933c7 100644 --- a/README.md +++ b/README.md @@ -250,7 +250,7 @@ These are the available fields with their respective default values: The grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but needed configuration can 
be customized from the same top level `_config` field. For example to allow anonymous access to grafana, add the following `_config` section: ``` grafana+:: { - config: { + config: { // http://docs.grafana.org/installation/configuration/ sections: { "auth.anonymous": {enabled: true}, }, diff --git a/examples/minikube.jsonnet b/examples/minikube.jsonnet index e1440798..850514fd 100644 --- a/examples/minikube.jsonnet +++ b/examples/minikube.jsonnet @@ -10,7 +10,7 @@ local kp = config: importstr 'alertmanager-config.yaml', }, grafana+:: { - config: { + config: { // http://docs.grafana.org/installation/configuration/ sections: { // Do not require grafana users to login/authenticate "auth.anonymous": {enabled: true}, From b33894020878678e3d32c42c4e6ad66d4beeca62 Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Wed, 5 Sep 2018 12:00:40 +0200 Subject: [PATCH 397/638] kube-prometheus: bump kubernetes-mixins This bumps the kubernetes-mixin dependency to add https://github.com/kubernetes-monitoring/kubernetes-mixin/pull/76. 
--- jsonnetfile.lock.json | 10 ++++---- manifests/grafana-dashboardDefinitions.yaml | 28 ++++++++++----------- manifests/prometheus-rules.yaml | 20 ++++++++++----- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5f6e091a..2ac75e69 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "66226d27f1fca6096c420b46b097b9e2475189e6" + "version": "f6c5c4311b8c8ad699cfa718a6e1226780b8b3a5" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "4090e091fee875fd2baec1531a7ef9c2ab58c99b" + "version": "fee96cc51d22f196c982c6152cc8aee2585f65c0" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "d21784739a9adc7992c0382d1efa42be4ddb3044" + "version": "7be7f8e4e8da37cac104d2655ca22fdb8a93ebcd" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "8c4610783991b82ff12e485d24ac4f82d8839743" + "version": "e6fe81715dd802b4c9d9c64f2c44ba6ee56d2000" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "af85949b416547ed0989b396a28fe77f65978828" + "version": "1df1ddff4361ed7f2c0f33571923511889a115ce" } ] } \ No newline at end of file diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 551647b8..7bae26bb 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1712,7 +1712,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max ((node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"})\n/ node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (namespace, pod, device)\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{node=\"$node\"}\n", + "expr": 
"node:node_filesystem_usage:\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{node=\"$node\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -2800,7 +2800,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss) by (namespace)", + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -3029,7 +3029,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_rss) by (namespace)", + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3047,7 +3047,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3065,7 +3065,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3612,7 +3612,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\"}) by (pod_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", container_name!=\"\"}) by (pod_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod_name}}", @@ -3841,7 +3841,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": 
"sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3859,7 +3859,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3877,7 +3877,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4451,7 +4451,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4680,7 +4680,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, 
\"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4707,7 +4707,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container_name!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4716,7 +4716,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5669,7 +5669,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max ((node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"}\n- node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"})\n/ node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"}) by (namespace, pod, device)\n", + "expr": "node:node_filesystem_usage:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 
121b974f..1809db6b 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -11,26 +11,26 @@ spec: - name: k8s.rules rules: - expr: | - sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace) record: namespace:container_cpu_usage_seconds_total:sum_rate - expr: | sum by (namespace, pod_name, container_name) ( - rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m]) + rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m]) ) record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate - expr: | - sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace) + sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace) record: namespace:container_memory_usage_bytes:sum - expr: | sum by (namespace, label_name) ( - sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace, pod_name) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name) * on (namespace, pod_name) group_left(label_name) label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") ) record: namespace_name:container_cpu_usage_seconds_total:sum_rate - expr: | sum by (namespace, label_name) ( - sum(container_memory_usage_bytes{job="kubelet",image!=""}) by (pod_name, namespace) + sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace) * on (namespace, pod_name) group_left(label_name) label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") ) @@ -232,6 +232,14 @@ spec: node_namespace_pod:kube_pod_info: ) record: node:node_disk_saturation:avg_irate + - expr: | + max by (namespace, pod, device) 
((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"} + - node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"}) + / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_usage:' + - expr: | + max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_avail:' - expr: | sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) @@ -569,7 +577,7 @@ spec: message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready expr: | - sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0 + sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 for: 1h labels: severity: critical From 65364967160603a38c945b1a05540a99e2b406a2 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 5 Sep 2018 22:42:28 +0200 Subject: [PATCH 398/638] kube-prometheus: Improve disk alert Only trigger disk-running-full alerts when disk usage is over 85%, and use the recording rules of the kubernetes-mixin so that only real physical filesystems are considered.
--- jsonnet/kube-prometheus/alerts/node.libsonnet | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet index 5c24f09f..27039f4e 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -7,11 +7,10 @@ { alert: 'NodeDiskRunningFull', annotations: { - description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})', - summary: 'Node disk is running full within 24 hours', + message: 'Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} is running full within the next 24 hours.', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{%(nodeExporterSelector)s} + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) ||| % $._config, 'for': '30m', labels: { @@ -21,11 +20,10 @@ { alert: 'NodeDiskRunningFull', annotations: { - description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})', - summary: 'Node disk is running full within 2 hours', + message: 'Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} is running full within the next 2 hours.', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) up{%(nodeExporterSelector)s} + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) ||| % $._config, 'for': '10m', labels: { From 2c288a798361138810e38c93a08ed3a6be61f1a0 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 5 Sep 2018 23:05:56 
+0200 Subject: [PATCH 399/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 4 ++-- manifests/prometheus-rules.yaml | 18 ++++++++---------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 2ac75e69..a7b3dab8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "f6c5c4311b8c8ad699cfa718a6e1226780b8b3a5" + "version": "c22d20f7fc8eb359d4f99bc440175dee0d31c3cf" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "fee96cc51d22f196c982c6152cc8aee2585f65c0" + "version": "ab3d27befcdb31ec286790b8e0ca49bf1deecce5" }, { "name": "grafonnet", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 1809db6b..7958b926 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -781,10 +781,10 @@ spec: annotations: message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is expected to fill up within four - days. + days. Currently {{ $value }} bytes are available. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays expr: | - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0 + kubelet_volume_stats_available_bytes{job="kubelet"} and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m labels: severity: critical @@ -954,21 +954,19 @@ spec: rules: - alert: NodeDiskRunningFull annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 24 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 24 hours + message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace + }}/{{ $labels.pod }} is running full within the next 24 hours. expr: | - predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{job="node-exporter"} + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) for: 30m labels: severity: warning - alert: NodeDiskRunningFull annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 2 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 2 hours + message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace + }}/{{ $labels.pod }} is running full within the next 2 hours. 
expr: | - predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) up{job="node-exporter"} + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) for: 10m labels: severity: critical From a9527247f68fc28b3f6d7c77e8677f26869058d8 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 6 Sep 2018 19:02:46 +0200 Subject: [PATCH 400/638] contrib/kube-prometheus: Use default timezone in dashboards --- jsonnetfile.lock.json | 4 ++-- manifests/grafana-dashboardDefinitions.yaml | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index a7b3dab8..613b0ad8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "c22d20f7fc8eb359d4f99bc440175dee0d31c3cf" + "version": "ce4ab08d6791161267204d9a61588e64f1b57e05" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "ab3d27befcdb31ec286790b8e0ca49bf1deecce5" + "version": "d445c4d98fdf88fd3c59bb34ca4b0f82536f878c" }, { "name": "grafonnet", diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 7bae26bb..e8f39619 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -907,7 +907,7 @@ items: "30d" ] }, - "timezone": "utc", + "timezone": "", "title": "K8s / USE Method / Cluster", "uid": "a6e7d1362e1ddbb79db21d5bb40d7137", "version": 0 @@ -1850,7 +1850,7 @@ items: "30d" ] }, - "timezone": "utc", + "timezone": "", "title": "K8s / USE Method / Node", "uid": "4ac4f123aae0ff6dbaf4f4f66120033b", "version": 0 @@ -3178,7 +3178,7 @@ items: "30d" ] }, - "timezone": "utc", + "timezone": "", "title": "K8s / Compute Resources / Cluster", "uid": "efa86fd1d0c121a26444b636a3f509a8", "version": 0 @@ -4017,7 
+4017,7 @@ items: "30d" ] }, - "timezone": "utc", + "timezone": "", "title": "K8s / Compute Resources / Namespace", "uid": "85a562078cdf77779eaa1add43ccec1e", "version": 0 @@ -4883,7 +4883,7 @@ items: "30d" ] }, - "timezone": "utc", + "timezone": "", "title": "K8s / Compute Resources / Pod", "uid": "6581e46e4e5c7ba40a07646395ef7b23", "version": 0 @@ -6001,7 +6001,7 @@ items: "30d" ] }, - "timezone": "browser", + "timezone": "", "title": "Nodes", "uid": "fa49a4706d07a042595b664c87fb33ea", "version": 0 @@ -6485,7 +6485,7 @@ items: "30d" ] }, - "timezone": "browser", + "timezone": "", "title": "Pods", "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", "version": 0 @@ -7335,7 +7335,7 @@ items: "30d" ] }, - "timezone": "browser", + "timezone": "", "title": "StatefulSets", "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 From 34dd0a798a95ef793a1bea33cbeb6e4c19f1b743 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Fri, 7 Sep 2018 15:21:16 +0200 Subject: [PATCH 401/638] contrib/.../alerts: improve alert messages --- jsonnet/kube-prometheus/alerts/alertmanager.libsonnet | 2 +- jsonnet/kube-prometheus/alerts/general.libsonnet | 4 ++-- jsonnet/kube-prometheus/alerts/node.libsonnet | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index d283cc18..90e3f5c7 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -21,7 +21,7 @@ { alert: 'AlertmanagerDownOrMissing', annotations: { - description: 'An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.', + description: 'An unexpected number of Alertmanagers were scraped or disappeared from discovery.', summary: 'Alertmanager down or missing', }, expr: ||| diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet index 6f3e4534..ecdd9a21 
100644 --- a/jsonnet/kube-prometheus/alerts/general.libsonnet +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -7,7 +7,7 @@ { alert: 'TargetDown', annotations: { - description: '{{ $value }}% of {{ $labels.job }} targets are down.', + description: '{{ $value }}% of the {{ $labels.job }} targets are down.', summary: 'Targets are down', }, expr: '100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10', @@ -19,7 +19,7 @@ { alert: 'DeadMansSwitch', annotations: { - description: 'This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.', + description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', summary: 'Alerting DeadMansSwitch', }, expr: 'vector(1)', diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet index 27039f4e..37fff428 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -7,7 +7,7 @@ { alert: 'NodeDiskRunningFull', annotations: { - message: 'Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} is running full within the next 24 hours.', + message: 'Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 24 hours.', }, expr: ||| (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) @@ -20,7 +20,7 @@ { alert: 'NodeDiskRunningFull', annotations: { - message: 'Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} is running full within the next 2 hours.', + message: 'Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 2 hours.', }, expr: ||| (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) From 483a1c115fb06fac4064ae288f8c513ea056e12d Mon Sep 17 00:00:00 2001 From: 
Matthias Loibl Date: Thu, 13 Sep 2018 14:59:50 +0200 Subject: [PATCH 402/638] contrib/kube-prometheus: Fix outdated alerts to use message annotation --- jsonnet/kube-prometheus/alerts/alertmanager.libsonnet | 9 +++------ jsonnet/kube-prometheus/alerts/general.libsonnet | 6 ++---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index 90e3f5c7..f3bbd0ea 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -7,8 +7,7 @@ { alert: 'AlertmanagerConfigInconsistent', annotations: { - description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', - summary: 'Configuration out of sync', + message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', }, expr: ||| count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 @@ -21,8 +20,7 @@ { alert: 'AlertmanagerDownOrMissing', annotations: { - description: 'An unexpected number of Alertmanagers were scraped or disappeared from discovery.', - summary: 'Alertmanager down or missing', + message: 'An unexpected number of Alertmanagers were scraped or disappeared from discovery.', }, expr: ||| label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{%(alertmanagerSelector)s}) BY (job) != 1 @@ -35,8 +33,7 @@ { alert: 'AlertmanagerFailedReload', annotations: { - description: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.", - summary: "Alertmanager's 
configuration reload failed", + message: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.", }, expr: ||| alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0 diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet index ecdd9a21..6ac25703 100644 --- a/jsonnet/kube-prometheus/alerts/general.libsonnet +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -7,8 +7,7 @@ { alert: 'TargetDown', annotations: { - description: '{{ $value }}% of the {{ $labels.job }} targets are down.', - summary: 'Targets are down', + message: '{{ $value }}% of the {{ $labels.job }} targets are down.', }, expr: '100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10', 'for': '10m', @@ -19,8 +18,7 @@ { alert: 'DeadMansSwitch', annotations: { - description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', - summary: 'Alerting DeadMansSwitch', + message: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', }, expr: 'vector(1)', labels: { From 8d38e81521b9ef1f51bdca72be2fa7db041f2764 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 13 Sep 2018 14:43:46 +0200 Subject: [PATCH 403/638] contrib/kube-prometheus: Create missing Prometheus operator alerts --- .../kube-prometheus/alerts/alerts.libsonnet | 3 +- .../alerts/prometheus-operator.libsonnet | 50 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/alerts/alerts.libsonnet index 19568a24..1b2d94eb 100644 --- a/jsonnet/kube-prometheus/alerts/alerts.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alerts.libsonnet @@ -1,4 +1,5 @@ (import 'alertmanager.libsonnet') + (import 'general.libsonnet') + (import 'node.libsonnet') + -(import 
'prometheus.libsonnet') +(import 'prometheus.libsonnet') + +(import 'prometheus-operator.libsonnet') diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet new file mode 100644 index 00000000..33dd97ce --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet @@ -0,0 +1,50 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'prometheus-operator', + rules: [ + { + alert: 'PrometheusOperatorAlertmanagerReconcileErrors', + expr: ||| + rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Errors while reconciling Alertmanager in {{$labels.namespace}} namespace.', + }, + 'for': '10m', + }, + { + alert: 'PrometheusOperatorPrometheusReconcileErrors', + expr: ||| + rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Errors while reconciling Prometheus in {{$labels.namespace}} namespace.', + }, + 'for': '10m', + }, + { + alert: 'PrometheusOperatorNodeLookupErrors', + expr: ||| + rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Errors while reconciling Prometheus in {{$labels.namespace}} namespace.', + }, + 'for': '10m', + }, + ], + }, + ], + }, +} From 5a935379d649ce0fd69f67497d64592929f977b8 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 13 Sep 2018 14:50:47 +0200 Subject: [PATCH 404/638] contrib/kube-prometheus: Run jb update and generate all manifests --- jsonnetfile.lock.json | 8 ++--- manifests/grafana-dashboardDefinitions.yaml | 2 +- manifests/prometheus-rules.yaml | 40 +++++++++++++++++---- 3 files changed, 38 insertions(+), 12 deletions(-) 
diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 613b0ad8..9817f7a9 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "ce4ab08d6791161267204d9a61588e64f1b57e05" + "version": "00c64bc438d2acf9c808388fe1e5d733e92b0c3b" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "d445c4d98fdf88fd3c59bb34ca4b0f82536f878c" + "version": "c70814dcafce1b51357938e09ee1192998a95706" }, { "name": "grafonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "1df1ddff4361ed7f2c0f33571923511889a115ce" + "version": "a7b1306ecfefeabe48286403b260513786289922" } ] -} \ No newline at end of file +} diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e8f39619..af68467a 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -4707,7 +4707,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container_name!=\"\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 7958b926..17b0de1d 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -838,7 +838,7 @@ spec: the limit of 110. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods expr: | - kubelet_running_pod_count{job="kubelet"} > 100 + kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 for: 15m labels: severity: warning @@ -914,8 +914,8 @@ spec: severity: critical - alert: AlertmanagerDownOrMissing annotations: - description: An unexpected number of Alertmanagers are scraped or Alertmanagers - disappeared from discovery. + description: An unexpected number of Alertmanagers were scraped or disappeared + from discovery. summary: Alertmanager down or missing expr: | label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1 @@ -936,7 +936,7 @@ spec: rules: - alert: TargetDown annotations: - description: '{{ $value }}% of {{ $labels.job }} targets are down.' + description: '{{ $value }}% of the {{ $labels.job }} targets are down.' summary: Targets are down expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 for: 10m @@ -944,7 +944,7 @@ spec: severity: warning - alert: DeadMansSwitch annotations: - description: This is a DeadMansSwitch meant to ensure that the entire Alerting + description: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional. summary: Alerting DeadMansSwitch expr: vector(1) @@ -955,7 +955,7 @@ spec: - alert: NodeDiskRunningFull annotations: message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace - }}/{{ $labels.pod }} is running full within the next 24 hours. + }}/{{ $labels.pod }} will be full within the next 24 hours. 
expr: | (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) for: 30m @@ -964,7 +964,7 @@ spec: - alert: NodeDiskRunningFull annotations: message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace - }}/{{ $labels.pod }} is running full within the next 2 hours. + }}/{{ $labels.pod }} will be full within the next 2 hours. expr: | (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) for: 10m @@ -1071,3 +1071,29 @@ spec: for: 10m labels: severity: warning + - name: prometheus-operator + rules: + - alert: PrometheusOperatorAlertmanagerReconcileErrors + annotations: + message: Errors while reconciling Alertmanager in {{$labels.namespace}} namespace. + expr: | + rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorPrometheusReconcileErrors + annotations: + message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. + expr: | + rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. + expr: | + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.01 + for: 10m + labels: + severity: warning From 8965c3e7b60ab3767c15d7c948096e493e8c8c47 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 13 Sep 2018 17:44:47 +0200 Subject: [PATCH 405/638] *: Add missing newline at the end of jsonnetfile.json --- jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.json b/jsonnetfile.json index b4ebb0f2..619586b2 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -11,4 +11,4 @@ "version": "." 
} ] -} \ No newline at end of file +} From 407aaa5e2feb5a56c270091c44bf7b4440549d4b Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 14 Sep 2018 11:08:15 +0200 Subject: [PATCH 406/638] contrib/kube-prometheus: Alert in 10% errors when reconciling Prom & Alertmanager --- .../kube-prometheus/alerts/prometheus-operator.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet index 33dd97ce..dd176271 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet @@ -7,7 +7,7 @@ { alert: 'PrometheusOperatorAlertmanagerReconcileErrors', expr: ||| - rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 ||| % $._config, labels: { severity: 'warning', @@ -20,7 +20,7 @@ { alert: 'PrometheusOperatorPrometheusReconcileErrors', expr: ||| - rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 ||| % $._config, labels: { severity: 'warning', @@ -33,7 +33,7 @@ { alert: 'PrometheusOperatorNodeLookupErrors', expr: ||| - rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.01 + rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 ||| % $._config, labels: { severity: 'warning', From b308b25accfd0ad32aed0d90f8f4fb76e249f10a Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 14 Sep 2018 11:29:45 +0200 Subject: [PATCH 407/638] contrib/kube-prometheus: Generate new rules based on latest jsonnet changes --- jsonnetfile.lock.json | 4 ++-- 
manifests/prometheus-rules.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 9817f7a9..035f78be 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "00c64bc438d2acf9c808388fe1e5d733e92b0c3b" + "version": "34035de0f6c20ed3d84ba9a28e23765f11cb0b9f" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "a7b1306ecfefeabe48286403b260513786289922" + "version": "001bbb97ccea05cb0d5f6e97c3939654244e8998" } ] } diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 17b0de1d..4f4de5d4 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1077,7 +1077,7 @@ spec: annotations: message: Errors while reconciling Alertmanager in {{$labels.namespace}} namespace. expr: | - rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.01 + rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m labels: severity: warning @@ -1085,7 +1085,7 @@ spec: annotations: message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. expr: | - rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.01 + rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m labels: severity: warning @@ -1093,7 +1093,7 @@ spec: annotations: message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. 
expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.01 + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m labels: severity: warning From 24141c464ff8be00b495e13750730ca072f6297e Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 14 Sep 2018 13:33:49 +0200 Subject: [PATCH 408/638] contrib/kube-prometheus: Improve consistency of Prometheus Operator alerts --- .../kube-prometheus/alerts/prometheus-operator.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet index dd176271..f851caa0 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet @@ -13,7 +13,7 @@ severity: 'warning', }, annotations: { - message: 'Errors while reconciling Alertmanager in {{$labels.namespace}} namespace.', + message: 'Errors while reconciling Alertmanager in {{ $labels.namespace }} Namespace.', }, 'for': '10m', }, @@ -26,7 +26,7 @@ severity: 'warning', }, annotations: { - message: 'Errors while reconciling Prometheus in {{$labels.namespace}} namespace.', + message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.', }, 'for': '10m', }, @@ -39,7 +39,7 @@ severity: 'warning', }, annotations: { - message: 'Errors while reconciling Prometheus in {{$labels.namespace}} namespace.', + message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.', }, 'for': '10m', }, From 7e34199dd8d57c10b1a2e26faf3052388ca93237 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Fri, 14 Sep 2018 13:36:43 +0200 Subject: [PATCH 409/638] contrib: jsonnetfile newline --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json 
b/jsonnet/kube-prometheus/jsonnetfile.json index 9ba63cfc..bb4b79f7 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -51,4 +51,4 @@ "version": "master" } ] -} +} \ No newline at end of file From df65f57fb48bb1564a73f911ccc1bf1d8535476d Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 14 Sep 2018 13:46:18 +0200 Subject: [PATCH 410/638] contrib/kube-prometheus: Generate new manifests after fixing typos in rules --- jsonnetfile.lock.json | 2 +- manifests/prometheus-rules.yaml | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 035f78be..e6904980 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "34035de0f6c20ed3d84ba9a28e23765f11cb0b9f" + "version": "bffc85d6e76f6341d5370af68ea980030ab402e8" }, { "name": "ksonnet", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 4f4de5d4..221fa726 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1075,7 +1075,8 @@ spec: rules: - alert: PrometheusOperatorAlertmanagerReconcileErrors annotations: - message: Errors while reconciling Alertmanager in {{$labels.namespace}} namespace. + message: Errors while reconciling Alertmanager in {{ $labels.namespace }} + Namespace. expr: | rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m @@ -1083,7 +1084,7 @@ spec: severity: warning - alert: PrometheusOperatorPrometheusReconcileErrors annotations: - message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. + message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. 
expr: | rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m @@ -1091,7 +1092,7 @@ spec: severity: warning - alert: PrometheusOperatorNodeLookupErrors annotations: - message: Errors while reconciling Prometheus in {{$labels.namespace}} namespace. + message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. expr: | rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m From 345f0457390cb60c3a870ebda9b3ff3506e8af7f Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Wed, 19 Sep 2018 11:54:01 +0100 Subject: [PATCH 411/638] contrib/kube-prometheus: Update README with up-to-date default versions With those changes, diff does not show any differences between generated manifests with and without the version override in a top level jsonnet file. --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 128933c7..b682de5b 100644 --- a/README.md +++ b/README.md @@ -187,13 +187,13 @@ These are the available fields with their respective default values: namespace: "default", versions+:: { - alertmanager: "v0.15.0", - nodeExporter: "v0.15.2", + alertmanager: "v0.15.2", + nodeExporter: "v0.16.0", kubeStateMetrics: "v1.3.1", kubeRbacProxy: "v0.3.1", addonResizer: "1.0", - prometheusOperator: "v0.18.1", - prometheus: "v2.2.1", + prometheusOperator: "v0.23.2", + prometheus: "v2.3.2", }, imageRepos+:: { From 2c0f18ae0951f2331a9cb7eeceb8f6ebf9273eb8 Mon Sep 17 00:00:00 2001 From: David Tesar Date: Fri, 21 Sep 2018 14:14:41 -0700 Subject: [PATCH 412/638] Update to grafana 5.2.4 --- manifests/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index cb8cc9d8..e378f689 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: 
containers: - - image: grafana/grafana:5.2.1 + - image: grafana/grafana:5.2.4 name: grafana ports: - containerPort: 3000 From be14369fad13889c6b65d2867d07b47a3a99aecb Mon Sep 17 00:00:00 2001 From: David Tesar Date: Fri, 21 Sep 2018 14:22:46 -0700 Subject: [PATCH 413/638] Add Prometheus dashboard instructions --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index b682de5b..58c1f91b 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,18 @@ $ kubectl create -f manifests/ 2>/dev/null || true # This command sometimes may $ kubectl delete -f manifests/ || true ``` +### Access the dashboards + +Prometheus, Grafana, and Alertmanager dashboards can be accessed after running the quickstart via the following commands: + +Prometheus + +```shell +export PROMETHEUS=$(kubectl get pods --namespace monitoring -l "app=prometheus" -o jsonpath="{.items[0].metadata.name}") +kubectl --namespace monitoring port-forward $PROMETHEUS 3000:3000 +``` +Then access via [http://localhost:3000](http://localhost:3000) + ## Customizing Kube-Prometheus This section: From 391158cfd0a8844783336d843746941f86f3bbd9 Mon Sep 17 00:00:00 2001 From: David Tesar Date: Fri, 21 Sep 2018 15:07:48 -0700 Subject: [PATCH 414/638] Add rest of dashboards --- README.md | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 58c1f91b..787c7b5b 100644 --- a/README.md +++ b/README.md @@ -82,9 +82,25 @@ Prometheus ```shell export PROMETHEUS=$(kubectl get pods --namespace monitoring -l "app=prometheus" -o jsonpath="{.items[0].metadata.name}") -kubectl --namespace monitoring port-forward $PROMETHEUS 3000:3000 +kubectl --namespace monitoring port-forward $PROMETHEUS 9090:9090 ``` -Then access via [http://localhost:3000](http://localhost:3000) +Then access via [http://localhost:9090](http://localhost:9090) + +Grafana + +```shell +export GRAFANA=$(kubectl get pods --namespace monitoring -l "app=grafana" -o 
jsonpath="{.items[0].metadata.name}") +kubectl --namespace monitoring port-forward $GRAFANA 3000:3000 +``` +Then access via [http://localhost:3000](http://localhost:3000) and use the default grafana user:password of `admin:admin`. + +Alert Manager + +```shell +export ALERTMGR=$(kubectl get pods --namespace monitoring -l "app=alertmanager" -o jsonpath="{.items[0].metadata.name}") +kubectl --namespace monitoring port-forward $ALERTMGR 9093:9093 +``` +Then access via [http://localhost:9093](http://localhost:9093) ## Customizing Kube-Prometheus From bdeea07556ff3f4a6a5eaa991f0033bbab9072c7 Mon Sep 17 00:00:00 2001 From: David Tesar Date: Sun, 23 Sep 2018 17:37:40 -0700 Subject: [PATCH 415/638] Add link to ingress instructions --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 787c7b5b..3c40a949 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,9 @@ $ kubectl delete -f manifests/ || true ### Access the dashboards -Prometheus, Grafana, and Alertmanager dashboards can be accessed after running the quickstart via the following commands: +Prometheus, Grafana, and Alertmanager dashboards can be accessed quickly using `kubectl port-forward` after running the quickstart via the commands below. + +> Note: There are instructions on how to route to these pods behdind an ingress controller in the [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) section. 
Prometheus From 71a522696253f70dfa0cf48ed625c15cf4c5fbc7 Mon Sep 17 00:00:00 2001 From: David Tesar Date: Mon, 24 Sep 2018 13:11:33 -0700 Subject: [PATCH 416/638] address feedback --- README.md | 16 ++++++++-------- manifests/grafana-deployment.yaml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 3c40a949..84bf6c28 100644 --- a/README.md +++ b/README.md @@ -76,32 +76,32 @@ $ kubectl delete -f manifests/ || true ### Access the dashboards -Prometheus, Grafana, and Alertmanager dashboards can be accessed quickly using `kubectl port-forward` after running the quickstart via the commands below. - +Prometheus, Grafana, and Alertmanager dashboards can be accessed quickly using `kubectl port-forward` after running the quickstart via the commands below. Kubernetes 1.10 or later is required. + > Note: There are instructions on how to route to these pods behdind an ingress controller in the [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) section. Prometheus ```shell -export PROMETHEUS=$(kubectl get pods --namespace monitoring -l "app=prometheus" -o jsonpath="{.items[0].metadata.name}") -kubectl --namespace monitoring port-forward $PROMETHEUS 9090:9090 +kubectl --namespace monitoring port-forward svc/prometheus-k8s 9090 ``` + Then access via [http://localhost:9090](http://localhost:9090) Grafana ```shell -export GRAFANA=$(kubectl get pods --namespace monitoring -l "app=grafana" -o jsonpath="{.items[0].metadata.name}") -kubectl --namespace monitoring port-forward $GRAFANA 3000:3000 +kubectl --namespace monitoring port-forward svc/grafana 3000 ``` + Then access via [http://localhost:3000](http://localhost:3000) and use the default grafana user:password of `admin:admin`. 
Alert Manager ```shell -export ALERTMGR=$(kubectl get pods --namespace monitoring -l "app=alertmanager" -o jsonpath="{.items[0].metadata.name}") -kubectl --namespace monitoring port-forward $ALERTMGR 9093:9093 +kubectl --namespace monitoring port-forward svc/alertmanager-main 9093 ``` + Then access via [http://localhost:9093](http://localhost:9093) ## Customizing Kube-Prometheus diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index e378f689..cb8cc9d8 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: grafana/grafana:5.2.4 + - image: grafana/grafana:5.2.1 name: grafana ports: - containerPort: 3000 From 59fd4cd63e00023ce3b712d3535e43e3514fb4b6 Mon Sep 17 00:00:00 2001 From: Elisiano Petrini Date: Wed, 26 Sep 2018 15:20:16 +0200 Subject: [PATCH 417/638] Add initial thanos support to kube-prometheus This is distributed as a library that the user is meant to import. 
example: ``` kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + (import 'kube-prometheus/kube-prometheus-thanos.libsonnet') + { _config+:: {...} }; ``` --- .../kube-prometheus-thanos.libsonnet | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet new file mode 100644 index 00000000..4ef00612 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -0,0 +1,64 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + + +{ + _config+:: { + versions+:: { + thanos: 'v0.1.0', + }, + imageRepos+:: { + thanos: 'improbable/thanos', + }, + }, + prometheus+:: { + prometheus+: { + spec+: { + podMetadata+: { + labels+: { 'thanos-peer': 'true' }, + }, + thanos+: { + peers: 'thanos-peers.' + $._config.namespace + '.svc:10900', + version: $._config.versions.thanos, + baseImage: $._config.imageRepos.thanos, + }, + }, + }, + thanosQueryDeployment: + local deployment = k.apps.v1beta2.deployment; + local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; + local containerPort = container.portsType; + + local thanosQueryContainer = + container.new('thanos-query', $._config.imageRepos.thanos + ':' + $._config.versions.thanos) + + container.withPorts([ + containerPort.newNamed('http', 10902), + containerPort.newNamed('grpc', 10901), + containerPort.newNamed('cluster', 10900), + ]) + + container.withArgs([ + 'query', + '--log.level=debug', + '--query.replica-label=prometheus_replica', + '--cluster.peers=thanos-peers.' 
+ $._config.namespace + '.svc:10900', + ]); + local podLabels = { app: 'thanos-query', 'thanos-peer': 'true' }; + deployment.new('thanos-query', 1, thanosQueryContainer, podLabels) + + deployment.mixin.metadata.withNamespace($._config.namespace) + + deployment.mixin.metadata.withLabels(podLabels) + + deployment.mixin.spec.selector.withMatchLabels(podLabels) + + deployment.mixin.spec.template.spec.withServiceAccountName('prometheus-' + $._config.prometheus.name), + thanosQueryService: + local thanosQueryPort = servicePort.newNamed('http-query', 9090, 'http'); + service.new('thanos-query', { app: 'thanos-query' }, thanosQueryPort) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ app: 'thanos-query' }), + thanosPeerService: + local thanosPeerPort = servicePort.newNamed('cluster', 10900, 'cluster'); + service.new('thanos-peers', { 'thanos-peer': 'true' }, thanosPeerPort) + + service.mixin.spec.withType('ClusterIP') + + service.mixin.spec.withClusterIp('None'), + + }, +} From 57a0e161ffdf7cff7a5a747c2fe0cde6750deed1 Mon Sep 17 00:00:00 2001 From: David Lefever Date: Wed, 26 Sep 2018 22:57:11 +0200 Subject: [PATCH 418/638] contrib/kube-prometheus: Run jb update and generate all manifests. 
--- jsonnetfile.lock.json | 12 +- manifests/grafana-dashboardDefinitions.yaml | 2 +- manifests/prometheus-rules.yaml | 124 +++++++++++--------- 3 files changed, 78 insertions(+), 60 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index e6904980..4fc52ff8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "bffc85d6e76f6341d5370af68ea980030ab402e8" + "version": "2694cabc85ed89b3c8ac0865bcbc29d72e52eb2f" }, { "name": "ksonnet", @@ -18,7 +18,7 @@ "subdir": "" } }, - "version": "83f20ee933bcd13fcf4ad1b49a40c92135c5569c" + "version": "ed0796f3cb97ebc35ae54f543b1814a7c8dae305" }, { "name": "kubernetes-mixin", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "c70814dcafce1b51357938e09ee1192998a95706" + "version": "19da1eb2f2558dad0f8d9e280cc1fe7bc835677b" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "7be7f8e4e8da37cac104d2655ca22fdb8a93ebcd" + "version": "64147daa1267a2571ef95609550b782ec9807c52" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "e6fe81715dd802b4c9d9c64f2c44ba6ee56d2000" + "version": "94aef231932810633416bfe596a41dbad2b1ebb9" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "001bbb97ccea05cb0d5f6e97c3939654244e8998" + "version": "a3e242d80ae1a13ae57904fc12e91fe4c9ecf972" } ] } diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index af68467a..1f9a7a88 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -3643,7 +3643,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "decbytes", "label": null, "logBase": 1, "max": null, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 221fa726..899c2ecc 100644 --- a/manifests/prometheus-rules.yaml +++ 
b/manifests/prometheus-rules.yaml @@ -564,8 +564,8 @@ spec: rules: - alert: KubePodCrashLooping annotations: - message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container - }}) is restarting {{ printf "%.2f" $value }} / second' + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is restarting {{ printf "%.2f" $value }} times / second. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping expr: | rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0 @@ -574,7 +574,8 @@ spec: severity: critical - alert: KubePodNotReady annotations: - message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.' + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready + state for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready expr: | sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 @@ -583,8 +584,9 @@ spec: severity: critical - alert: KubeDeploymentGenerationMismatch annotations: - message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation - mismatch + message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but has + not been rolled back. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch expr: | kube_deployment_status_observed_generation{job="kube-state-metrics"} @@ -595,8 +597,8 @@ spec: severity: critical - alert: KubeDeploymentReplicasMismatch annotations: - message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica - mismatch + message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not + matched the expected number of replicas for longer than an hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch expr: | kube_deployment_spec_replicas{job="kube-state-metrics"} @@ -607,8 +609,8 @@ spec: severity: critical - alert: KubeStatefulSetReplicasMismatch annotations: - message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica - mismatch + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has + not matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch expr: | kube_statefulset_status_replicas_ready{job="kube-state-metrics"} @@ -619,8 +621,9 @@ spec: severity: critical - alert: KubeStatefulSetGenerationMismatch annotations: - message: StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation - mismatch + message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but has + not been rolled back. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch expr: | kube_statefulset_status_observed_generation{job="kube-state-metrics"} @@ -629,10 +632,30 @@ spec: for: 15m labels: severity: critical + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update + has not been rolled out. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout + expr: | + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + for: 15m + labels: + severity: critical - alert: KubeDaemonSetRolloutStuck annotations: - message: Only {{$value}}% of desired pods scheduled and ready for daemon set - {{$labels.namespace}}/{{$labels.daemonset}} + message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace + }}/{{ $labels.daemonset }} are scheduled and ready. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck expr: | kube_daemonset_status_number_ready{job="kube-state-metrics"} @@ -643,8 +666,8 @@ spec: severity: critical - alert: KubeDaemonSetNotScheduled annotations: - message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} - are not scheduled. + message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are not scheduled.' 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled expr: | kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} @@ -655,8 +678,8 @@ spec: severity: warning - alert: KubeDaemonSetMisScheduled annotations: - message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} - are running where they are not supposed to run. + message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are running where they are not supposed to run.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled expr: | kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 @@ -676,7 +699,7 @@ spec: - alert: KubeJobCompletion annotations: message: Job {{ $labels.namespaces }}/{{ $labels.job }} is taking more than - 1h to complete. + one hour to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion expr: | kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 @@ -696,8 +719,8 @@ spec: rules: - alert: KubeCPUOvercommit annotations: - message: Overcommited CPU resource requests on Pods, cannot tolerate node - failure. + message: Cluster has overcommitted CPU resource requests for Pods and cannot + tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) @@ -710,8 +733,8 @@ spec: severity: warning - alert: KubeMemOvercommit annotations: - message: Overcommited Memory resource requests on Pods, cannot tolerate node - failure. + message: Cluster has overcommitted memory resource requests for Pods and cannot + tolerate node failure. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) @@ -726,7 +749,7 @@ spec: severity: warning - alert: KubeCPUOvercommit annotations: - message: Overcommited CPU resource request quota on Namespaces. + message: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) @@ -738,7 +761,7 @@ spec: severity: warning - alert: KubeMemOvercommit annotations: - message: Overcommited Memory resource request quota on Namespaces. + message: Cluster has overcommitted memory resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) @@ -750,13 +773,13 @@ spec: severity: warning - alert: KubeQuotaExceeded annotations: - message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in - namespace {{ $labels.namespace }}.' + message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value + }}% of its {{ $labels.resource }} quota. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded expr: | 100 * kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) - kube_resourcequota{job="kube-state-metrics", type="hard"} + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 90 for: 15m labels: @@ -765,9 +788,9 @@ spec: rules: - alert: KubePersistentVolumeUsageCritical annotations: - message: The persistent volume claimed by {{ $labels.persistentvolumeclaim - }} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}% - free. + message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value + }}% free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical expr: | 100 * kubelet_volume_stats_available_bytes{job="kubelet"} @@ -779,8 +802,8 @@ spec: severity: critical - alert: KubePersistentVolumeFullInFourDays annotations: - message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim - }} in namespace {{ $labels.namespace }} is expected to fill up within four + message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value }} bytes are available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays expr: | @@ -792,7 +815,7 @@ spec: rules: - alert: KubeNodeNotReady annotations: - message: '{{ $labels.node }} has been unready for more than an hour' + message: '{{ $labels.node }} has been unready for more than an hour.' 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready expr: | kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 @@ -825,7 +848,7 @@ spec: - alert: KubeClientErrors annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance - }}' is experiencing {{ printf "%0.0f" $value }} errors / sec.' + }}' is experiencing {{ printf "%0.0f" $value }} errors / second. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 @@ -834,8 +857,8 @@ spec: severity: warning - alert: KubeletTooManyPods annotations: - message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to - the limit of 110. + message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close + to the limit of 110. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods expr: | kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 @@ -845,7 +868,7 @@ spec: - alert: KubeAPILatencyHigh annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds - for {{$labels.verb}} {{$labels.resource}}. + for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 @@ -855,7 +878,7 @@ spec: - alert: KubeAPILatencyHigh annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds - for {{$labels.verb}} {{$labels.resource}}. + for {{ $labels.verb }} {{ $labels.resource }}. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 @@ -864,18 +887,18 @@ spec: severity: critical - alert: KubeAPIErrorsHigh annotations: - message: API server is erroring for {{ $value }}% of requests. + message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10 for: 10m labels: severity: critical - alert: KubeAPIErrorsHigh annotations: - message: API server is erroring for {{ $value }}% of requests. + message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) @@ -894,7 +917,7 @@ spec: severity: warning - alert: KubeClientCertificateExpiration annotations: - message: Kubernetes API certificate is expiring in less than 1 day. + message: Kubernetes API certificate is expiring in less than 24 hours. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 @@ -904,9 +927,8 @@ spec: rules: - alert: AlertmanagerConfigInconsistent annotations: - description: The configuration of the instances of the Alertmanager cluster - `{{$labels.service}}` are out of sync. - summary: Configuration out of sync + message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` + are out of sync. expr: | count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 for: 5m @@ -914,9 +936,8 @@ spec: severity: critical - alert: AlertmanagerDownOrMissing annotations: - description: An unexpected number of Alertmanagers were scraped or disappeared + message: An unexpected number of Alertmanagers were scraped or disappeared from discovery. - summary: Alertmanager down or missing expr: | label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1 for: 5m @@ -924,9 +945,8 @@ spec: severity: warning - alert: AlertmanagerFailedReload annotations: - description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. 
- summary: Alertmanager's configuration reload failed expr: | alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0 for: 10m @@ -936,17 +956,15 @@ spec: rules: - alert: TargetDown annotations: - description: '{{ $value }}% of the {{ $labels.job }} targets are down.' - summary: Targets are down + message: '{{ $value }}% of the {{ $labels.job }} targets are down.' expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 for: 10m labels: severity: warning - alert: DeadMansSwitch annotations: - description: This is a DeadMansSwitch meant to ensure that the entire alerting + message: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional. - summary: Alerting DeadMansSwitch expr: vector(1) labels: severity: none From 916863e4d53a254f0497695d55014ae9d8654612 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Tue, 25 Sep 2018 17:14:30 +0200 Subject: [PATCH 419/638] contrib/kube-prometheus: Set podAntiAffinity for Prometheus & Alertmanager contrib/kube-prometheus: Generate new manifests with antiAffinity contrib/kube-prometheus: jb update Documentation: Generate after updating kube-prometheus manifests contrib/kube-prometheus: Move antiaffinity into own jsonnet file foobar --- .../kube-prometheus-anti-affinity.libsonnet | 39 +++++++++++++++++++ .../prometheus/prometheus.libsonnet | 10 +++-- 2 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet diff --git a/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet new file mode 100644 index 00000000..6956e3db --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet @@ -0,0 +1,39 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local statefulSet = k.apps.v1beta2.statefulSet; +local affinity = 
statefulSet.mixin.spec.template.spec.affinity.podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecutionType; +local matchExpression = affinity.mixin.podAffinityTerm.labelSelector.matchExpressionsType; + +{ + local antiaffinity(key, values) = { + affinity: { + podAntiAffinity: { + preferredDuringSchedulingIgnoredDuringExecution: [ + affinity.new() + + affinity.withWeight(100) + + affinity.mixin.podAffinityTerm.withNamespaces($._config.namespace) + + affinity.mixin.podAffinityTerm.withTopologyKey('kubernetes.io/hostname') + + affinity.mixin.podAffinityTerm.labelSelector.withMatchExpressions([ + matchExpression.new() + + matchExpression.withKey(key) + + matchExpression.withOperator('In') + + matchExpression.withValues(values), + ]), + ], + }, + }, + }, + + alertmanager+:: { + alertmanager+: { + spec+: + antiaffinity('alertmanager', [$._config.alertmanager.name]), + }, + }, + + prometheus+: { + prometheus+: { + spec+: + antiaffinity('prometheus', [$._config.prometheus.name]), + }, + }, +} diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 63df0506..452a89e3 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -144,12 +144,14 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local roleList = k.rbac.v1.roleList; roleList.new([newSpecificRole(x) for x in $._config.prometheus.namespaces]), prometheus: - local container = k.core.v1.pod.mixin.spec.containersType; + local statefulSet = k.apps.v1beta2.statefulSet; + local container = statefulSet.mixin.spec.template.spec.containersType; local resourceRequirements = container.mixin.resourcesType; - local selector = k.apps.v1beta2.deployment.mixin.spec.selectorType; + local selector = statefulSet.mixin.spec.selectorType; - local resources = resourceRequirements.new() + - resourceRequirements.withRequests({ memory: '400Mi' }); + local resources = + 
resourceRequirements.new() + + resourceRequirements.withRequests({ memory: '400Mi' }); { apiVersion: 'monitoring.coreos.com/v1', From 3ff5d0127734a51e1fab973ba9cdaff6ef06897b Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Fri, 28 Sep 2018 11:40:12 +0200 Subject: [PATCH 420/638] contrib: remove duplicate alertmanager alert --- .../kube-prometheus/alerts/alertmanager.libsonnet | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index f3bbd0ea..87363b2b 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -17,19 +17,6 @@ severity: 'critical', }, }, - { - alert: 'AlertmanagerDownOrMissing', - annotations: { - message: 'An unexpected number of Alertmanagers were scraped or disappeared from discovery.', - }, - expr: ||| - label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{%(alertmanagerSelector)s}) BY (job) != 1 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - }, { alert: 'AlertmanagerFailedReload', annotations: { From 7d97b71a62fae5a4271efcd8cd5c92584fddc211 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Fri, 28 Sep 2018 11:47:39 +0200 Subject: [PATCH 421/638] contrib: regenerate --- jsonnetfile.lock.json | 2 +- manifests/prometheus-rules.yaml | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 4fc52ff8..c8290a1a 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "2694cabc85ed89b3c8ac0865bcbc29d72e52eb2f" + "version": "ad23783d41d47f04bb1262ab232d4a9c160570c6" }, { "name": "ksonnet", diff --git a/manifests/prometheus-rules.yaml 
b/manifests/prometheus-rules.yaml index 899c2ecc..2f1cd4ab 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -934,15 +934,6 @@ spec: for: 5m labels: severity: critical - - alert: AlertmanagerDownOrMissing - annotations: - message: An unexpected number of Alertmanagers were scraped or disappeared - from discovery. - expr: | - label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1 - for: 5m - labels: - severity: warning - alert: AlertmanagerFailedReload annotations: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace From 0f0c168b44bf8cdf489b1a6bf0490fa72919d118 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 13 Sep 2018 15:07:26 +0200 Subject: [PATCH 422/638] *: Update to Prometheus v2.4.0 --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 63df0506..7f5cfbdc 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheus: 'v2.3.2', + prometheus: 'v2.4.2', }, imageRepos+:: { From 263b2f2bc0a0fea1e3e91355470d86c5bde75523 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Fri, 28 Sep 2018 15:22:55 +0200 Subject: [PATCH 423/638] kube-prometheus: Update jsonnet dependencies --- jsonnetfile.lock.json | 2 +- manifests/prometheus-prometheus.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index c8290a1a..d5fabd04 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ 
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "ad23783d41d47f04bb1262ab232d4a9c160570c6" + "version": "50b1dbe739d9e4a59fb936b1733f8e53c86de897" }, { "name": "ksonnet", diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 37d0e663..89d69c9f 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -25,4 +25,4 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - version: v2.3.2 + version: v2.4.2 From 6c7c756b265994f55b37f57d10ec066032790c7d Mon Sep 17 00:00:00 2001 From: Robert Nemeti Date: Mon, 1 Oct 2018 10:26:08 +0200 Subject: [PATCH 424/638] objectify alertmanager config this way it will be possible to merge configs --- .../alertmanager/alertmanager.libsonnet | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index a9e9c037..ad3fb0e8 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -14,22 +14,31 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; alertmanager+:: { name: $._config.alertmanager.name, - config: ||| - global: - resolve_timeout: 5m - route: - group_by: ['job'] - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - receiver: 'null' - routes: - - match: - alertname: DeadMansSwitch - receiver: 'null' - receivers: - - name: 'null' - |||, + config: { + global: { + resolve_timeout: '5m', + }, + route: { + group_by: ['job'], + group_wait: '30s', + group_interval: '5m', + repeat_interval: '12h', + receiver: 'null', + routes: [ + { + receiver: 'null', + match: { + alertname: 'DeadMansSwitch', + }, + }, + ], + }, + receivers: [ + { + name: 'null', + }, + ], + }, replicas: 3, }, }, @@ -38,7 +47,7 @@ local k = import 
'ksonnet/ksonnet.beta.3/k.libsonnet'; secret: local secret = k.core.v1.secret; - secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) + + secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64(std.manifestYamlDoc($._config.alertmanager.config)) }) + secret.mixin.metadata.withNamespace($._config.namespace), serviceAccount: From eca6e624020105902ab1a3fd677efa19618c4f47 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 1 Oct 2018 14:11:04 +0200 Subject: [PATCH 425/638] contrib/kube-prometheus: Drop etcd metrics by apiserver & kube controller --- .../prometheus/prometheus.libsonnet | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 3efac32a..809cb1e9 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -308,6 +308,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; { port: 'http-metrics', interval: '30s', + metricRelabelings: [ + { + sourceLabels: ['__name__'], + regex: 'etcd_(debugging|disk|request|server).*', + action: 'drop', + }, + ], }, ], selector: { @@ -356,6 +363,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; serverName: 'kubernetes', }, bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + metricRelabelings: [ + { + sourceLabels: ['__name__'], + regex: 'etcd_(debugging|disk|request|server).*', + action: 'drop', + }, + ], }, ], }, From 32a24f07cb97293a36a68c5234d5945f14fc9b2c Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 1 Oct 2018 14:51:10 +0200 Subject: [PATCH 426/638] contrib/kube-prometheus: Generate new manifests after dropping etcd labels --- jsonnetfile.lock.json | 6 +++--- manifests/prometheus-serviceMonitorApiserver.yaml | 5 +++++ 
.../prometheus-serviceMonitorKubeControllerManager.yaml | 5 +++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index d5fabd04..4a773925 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "50b1dbe739d9e4a59fb936b1733f8e53c86de897" + "version": "004e648d186bc7be6f1f519da26f96bc2533f1b6" }, { "name": "ksonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "94aef231932810633416bfe596a41dbad2b1ebb9" + "version": "bce24b0b087f7dc09c9e9f066f3e554a851792e9" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "a3e242d80ae1a13ae57904fc12e91fe4c9ecf972" + "version": "c74998267c71ef4a0fa847ce16d620b7fe3580bf" } ] } diff --git a/manifests/prometheus-serviceMonitorApiserver.yaml b/manifests/prometheus-serviceMonitorApiserver.yaml index 0cffe541..6d884a2b 100644 --- a/manifests/prometheus-serviceMonitorApiserver.yaml +++ b/manifests/prometheus-serviceMonitorApiserver.yaml @@ -9,6 +9,11 @@ spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token interval: 30s + metricRelabelings: + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ port: https scheme: https tlsConfig: diff --git a/manifests/prometheus-serviceMonitorKubeControllerManager.yaml b/manifests/prometheus-serviceMonitorKubeControllerManager.yaml index dfb2a25d..153a90da 100644 --- a/manifests/prometheus-serviceMonitorKubeControllerManager.yaml +++ b/manifests/prometheus-serviceMonitorKubeControllerManager.yaml @@ -8,6 +8,11 @@ metadata: spec: endpoints: - interval: 30s + metricRelabelings: + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ port: http-metrics jobLabel: k8s-app namespaceSelector: From bb6b5099dc4645204bf445d3dad958156c256fad Mon Sep 17 00:00:00 2001 From: Robert 
Nemeti Date: Tue, 2 Oct 2018 15:14:21 +0200 Subject: [PATCH 427/638] allow backward compatibility with the textblock --- jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index ad3fb0e8..91cf05f1 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -47,7 +47,11 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; secret: local secret = k.core.v1.secret; - secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64(std.manifestYamlDoc($._config.alertmanager.config)) }) + + if std.type($._config.alertmanager.config) == "object" then + secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64(std.manifestYamlDoc($._config.alertmanager.config)) }) + else + secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) + + secret.mixin.metadata.withNamespace($._config.namespace), serviceAccount: From 5c393ba3ee71eff61748c03694686641999c7b07 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 9 Oct 2018 16:09:55 +0200 Subject: [PATCH 428/638] kube-prometheus/README.md: Add jb update to contrib guide With pinning the kube-prometheus dependency for the generated `/manifest` folder, one needs to update the `jsonnetfile.lock.json` on `*.jsonnet` file changes. --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 84bf6c28..84035310 100644 --- a/README.md +++ b/README.md @@ -497,5 +497,7 @@ the following process: 1. Make your changes in the respective `*.jsonnet` file. 2. Commit your changes (This is currently necessary due to our vendoring process. This is likely to change in the future). +3. 
Update the pinned kube-prometheus dependency in `jsonnetfile.lock.json`: `jb + update`. 3. Generate dependent `*.yaml` files: `make generate-in-docker`. 4. Commit the generated changes. From be55290b3fdd78656812df01a9934eee0243e6e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20R=C3=BCger?= Date: Mon, 8 Oct 2018 14:09:34 +0200 Subject: [PATCH 429/638] *: Update to Prometheus v2.4.3 --- README.md | 2 +- .../prometheus/prometheus.libsonnet | 2 +- jsonnetfile.lock.json | 8 +++---- manifests/grafana-deployment.yaml | 2 +- manifests/prometheus-prometheus.yaml | 2 +- manifests/prometheus-rules.yaml | 23 ++++++++++++++++--- 6 files changed, 28 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 84bf6c28..cc24beaf 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,7 @@ These are the available fields with their respective default values: kubeRbacProxy: "v0.3.1", addonResizer: "1.0", prometheusOperator: "v0.23.2", - prometheus: "v2.3.2", + prometheus: "v2.4.3", }, imageRepos+:: { diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 809cb1e9..d6cbc5fe 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheus: 'v2.4.2', + prometheus: 'v2.4.3', }, imageRepos+:: { diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 4a773925..1fe6b56b 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "004e648d186bc7be6f1f519da26f96bc2533f1b6" + "version": "e53530d13d400496721104c2d30f52fe2b6ff427" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "19da1eb2f2558dad0f8d9e280cc1fe7bc835677b" + "version": "d24c4066aa2653370e1403812202eb38b2e70210" }, { "name": 
"grafonnet", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "9a20f81c9007e4c7409dd0b3edda1a7a78ad2c63" + "version": "850525cfa7a82115cf7a8a85f5ca632f4632be3d" }, { "name": "prometheus-operator", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "c74998267c71ef4a0fa847ce16d620b7fe3580bf" + "version": "ba606bf85edfb3007f27a97d41f54e3fe3f70ce6" } ] } diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index cb8cc9d8..e378f689 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: grafana/grafana:5.2.1 + - image: grafana/grafana:5.2.4 name: grafana ports: - containerPort: 3000 diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 89d69c9f..ae18cd67 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -25,4 +25,4 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - version: v2.4.2 + version: v2.4.3 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 2f1cd4ab..db10ae31 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -565,10 +565,10 @@ spec: - alert: KubePodCrashLooping annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container - }}) is restarting {{ printf "%.2f" $value }} times / second. + }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping expr: | - rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0 + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 for: 1h labels: severity: critical @@ -784,6 +784,17 @@ spec: for: 15m labels: severity: warning + - alert: CPUThrottlingHigh + annotations: + message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace + }} for {{ $labels.container_name }}.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh + expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by + (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total[5m])) + by (container_name, pod_name, namespace)\n > 25 \n" + for: 15m + labels: + severity: warning - name: kubernetes-storage rules: - alert: KubePersistentVolumeUsageCritical @@ -807,7 +818,13 @@ spec: days. Currently {{ $value }} bytes are available. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays expr: | - kubelet_volume_stats_available_bytes{job="kubelet"} and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 + ( + kubelet_volume_stats_used_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + ) > 0.85 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m labels: severity: critical From 5cb0e51cbfe4d507dfb163ea5a720d00b77ac95b Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 10 Oct 2018 11:33:36 +0200 Subject: [PATCH 430/638] contrib/kube-prometheus: Fix version for kube-prometheus --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 1fe6b56b..443c00a7 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "e53530d13d400496721104c2d30f52fe2b6ff427" + "version": "049c48c931bfb3cd72efd313b7a47d2244456db0" }, { "name": "ksonnet", From acd8924d5789286864418ca17d9ba72b2aac5a13 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 8 Oct 2018 15:29:18 +0200 Subject: [PATCH 431/638] Add triggered_total metric to alertmanager controller Update client_golang for wrappable registerer --- .../alerts/alertmanager.libsonnet | 2 +- .../alerts/prometheus-operator.libsonnet | 19 +++---------------- jsonnetfile.lock.json | 2 +- manifests/prometheus-rules.yaml | 18 +++++------------- 4 files changed, 10 insertions(+), 31 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index 87363b2b..c2e440c5 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -10,7 +10,7 
@@ message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', }, expr: ||| - count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 ||| % $._config, 'for': '5m', labels: { diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet index f851caa0..a430c505 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet @@ -5,28 +5,15 @@ name: 'prometheus-operator', rules: [ { - alert: 'PrometheusOperatorAlertmanagerReconcileErrors', + alert: 'PrometheusOperatorReconcileErrors', expr: ||| - rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 + rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 ||| % $._config, labels: { severity: 'warning', }, annotations: { - message: 'Errors while reconciling Alertmanager in {{ $labels.namespace }} Namespace.', - }, - 'for': '10m', - }, - { - alert: 'PrometheusOperatorPrometheusReconcileErrors', - expr: ||| - rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 - ||| % $._config, - labels: { - severity: 'warning', - }, - annotations: { - message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.', + message: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} 
Namespace.', }, 'for': '10m', }, diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 443c00a7..8ed8c60e 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "049c48c931bfb3cd72efd313b7a47d2244456db0" + "version": "d874b5bc21649dd9d07ab42dd3bdea515038953e" }, { "name": "ksonnet", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index db10ae31..519997c6 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -947,7 +947,7 @@ spec: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. expr: | - count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 for: 5m labels: severity: critical @@ -1099,20 +1099,12 @@ spec: severity: warning - name: prometheus-operator rules: - - alert: PrometheusOperatorAlertmanagerReconcileErrors + - alert: PrometheusOperatorReconcileErrors annotations: - message: Errors while reconciling Alertmanager in {{ $labels.namespace }} - Namespace. + message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace + }} Namespace. expr: | - rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorPrometheusReconcileErrors - annotations: - message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. 
- expr: | - rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 + rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 for: 10m labels: severity: warning From f3125f2f6539f28f2478f53531feac2e6a174213 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 10 Oct 2018 18:11:39 +0200 Subject: [PATCH 432/638] contrib/kube-prometheus: jb update after squashing --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 8ed8c60e..e684fedc 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "d874b5bc21649dd9d07ab42dd3bdea515038953e" + "version": "e316fd4c4d1d39a5490c02a2b76e239a5378a8f2" }, { "name": "ksonnet", From 36c94517529a6cb67a8694fd22f8b0727986d8c3 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 12 Oct 2018 13:46:56 +0200 Subject: [PATCH 433/638] Update kube-prometheus' dependency of prometheus-operator to v0.24 --- README.md | 2 +- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6fc03aec..252ddd4b 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,7 @@ These are the available fields with their respective default values: kubeStateMetrics: "v1.3.1", kubeRbacProxy: "v0.3.1", addonResizer: "1.0", - prometheusOperator: "v0.23.2", + prometheusOperator: "v0.24.2", prometheus: "v2.4.3", }, diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index bb4b79f7..96e1fe49 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.23.2" + "version": "v0.24.0" }, { "name": "etcd-mixin", From e2476ce2ef254d43ca2614c4f2eb0ab39dc20991 Mon Sep 17 00:00:00 
2001 From: Matthias Loibl Date: Fri, 12 Oct 2018 13:55:59 +0200 Subject: [PATCH 434/638] contrib/kube-prometheus: Generate new manifests and docs with prom operator v0.24 --- README.md | 2 +- jsonnetfile.lock.json | 4 +- ...0alertmanagerCustomResourceDefinition.yaml | 26 +++++++-- ...r-0prometheusCustomResourceDefinition.yaml | 29 ++++++++-- ...rometheusruleCustomResourceDefinition.yaml | 4 +- ...ervicemonitorCustomResourceDefinition.yaml | 56 +++++++++++++++++++ .../0prometheus-operator-clusterRole.yaml | 1 + .../0prometheus-operator-deployment.yaml | 4 +- 8 files changed, 111 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 252ddd4b..36aea2cb 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,7 @@ These are the available fields with their respective default values: kubeStateMetrics: "v1.3.1", kubeRbacProxy: "v0.3.1", addonResizer: "1.0", - prometheusOperator: "v0.24.2", + prometheusOperator: "v0.24.0", prometheus: "v2.4.3", }, diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index e684fedc..f4c615b8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "e316fd4c4d1d39a5490c02a2b76e239a5378a8f2" + "version": "13cb3c515152fb8495cdc6364938f19bff860e70" }, { "name": "ksonnet", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "27b1eb72d9d93e5ab447121e2b884bc558bde01d" + "version": "4a7fea51ab3f10329472c07028354617fb6635fe" }, { "name": "etcd-mixin", diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 6ee94918..7c5ff668 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -26,6 +26,12 @@ spec: description: 'AlertmanagerSpec is a specification of the desired behavior of the 
Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: + additionalPeers: + description: AdditionalPeers allows injecting a set of additional Alertmanagers + to peer with to form a highly available cluster. + items: + type: string + type: array affinity: description: Affinity is a group of affinity scheduling rules. properties: @@ -1668,6 +1674,9 @@ spec: Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids type: string + priorityClassName: + description: Priority class assigned to the Pods + type: string replicas: description: Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the running cluster @@ -1689,7 +1698,8 @@ spec: type: object retention: description: Time duration Alertmanager shall retain data for. Default - is '120h'. + is '120h', and must match the regular expression `[0-9]+(ms|s|m|h|d|w|y)` + (milliseconds seconds minutes hours days weeks years). type: string routePrefix: description: The route prefix Alertmanager registers HTTP handlers for. @@ -1793,14 +1803,22 @@ spec: description: ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods. type: string + sha: + description: SHA of Alertmanager container image to be deployed. Defaults + to the value of `version`. Similar to a tag, but the SHA explicitly + deploys an immutable container image. Version and Tag are ignored + if SHA is set. + type: string storage: description: StorageSpec defines the configured storage for a group - Prometheus servers. + Prometheus servers. If neither `emptyDir` nor `volumeClaimTemplate` + is specified, then by default an [EmptyDir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) + will be used. properties: class: description: 'Name of the StorageClass to use when requesting storage provisioning. 
More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses - DEPRECATED' + (DEPRECATED - instead use `volumeClaimTemplate.spec.storageClassName`)' type: string emptyDir: description: Represents an empty directory for a pod. Empty directory @@ -2331,7 +2349,7 @@ spec: type: string tag: description: Tag of Alertmanager container image to be deployed. Defaults - to the value of `version`. + to the value of `version`. Version is ignored if Tag is set. type: string tolerations: description: If specified, the pod's tolerations. diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 6eba60db..0117d343 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1830,6 +1830,9 @@ spec: Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids type: string + priorityClassName: + description: Priority class assigned to the Pods + type: string remoteRead: description: If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way. @@ -2095,7 +2098,8 @@ spec: type: object retention: description: Time duration Prometheus shall retain data for. Default - is '24h'. + is '24h', and must match the regular expression `[0-9]+(ms|s|m|h|d|w|y)` + (milliseconds seconds minutes hours days weeks years). type: string routePrefix: description: The route prefix Prometheus registers HTTP handlers for. @@ -2374,14 +2378,22 @@ spec: "In", and the values array contains only "value". The requirements are ANDed. type: object + sha: + description: SHA of Prometheus container image to be deployed. Defaults + to the value of `version`. Similar to a tag, but the SHA explicitly + deploys an immutable container image. Version and Tag are ignored + if SHA is set. 
+ type: string storage: description: StorageSpec defines the configured storage for a group - Prometheus servers. + Prometheus servers. If neither `emptyDir` nor `volumeClaimTemplate` + is specified, then by default an [EmptyDir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) + will be used. properties: class: description: 'Name of the StorageClass to use when requesting storage provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses - DEPRECATED' + (DEPRECATED - instead use `volumeClaimTemplate.spec.storageClassName`)' type: string emptyDir: description: Represents an empty directory for a pod. Empty directory @@ -2912,7 +2924,7 @@ spec: type: string tag: description: Tag of Prometheus container image to be deployed. Defaults - to the value of `version`. + to the value of `version`. Version is ignored if Tag is set. type: string thanos: description: ThanosSpec defines parameters for a Prometheus server within @@ -3015,9 +3027,16 @@ spec: description: Whether to use S3 Signature Version 2; otherwise Signature Version 4 will be used. type: boolean + sha: + description: SHA of Thanos container image to be deployed. Defaults + to the value of `version`. Similar to a tag, but the SHA explicitly + deploys an immutable container image. Version and Tag are ignored + if SHA is set. + type: string tag: description: Tag of Thanos sidecar container image to be deployed. - Defaults to the value of `version`. + Defaults to the value of `version`. Version is ignored if Tag + is set. type: string version: description: Version describes the version of Thanos to use. 
diff --git a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index 0a9873c9..52587192 100644 --- a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -322,7 +322,9 @@ spec: annotations: type: object expr: - type: string + anyOf: + - type: string + - type: integer for: type: string labels: diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index 9d96bfeb..d2e310fd 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -136,6 +136,51 @@ spec: description: ProxyURL eg http://proxyserver:2195 Directs scrapes to proxy through this endpoint. type: string + relabelings: + description: 'RelabelConfigs to apply to samples before ingestion. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#' + items: + description: 'RelabelConfig allows dynamic rewriting of the + label set, being applied to samples before ingestion. It defines + ``-section of Prometheus configuration. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. + Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label + values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched. defailt is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace + is performed if the regular expression matches. Regex + capture groups are available. 
Default is '$1' + type: string + separator: + description: Separator placed between concatenated source + label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing + labels. Their content is concatenated using the configured + separator and matched against the configured regular expression + for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written + in a replace action. It is mandatory for replace actions. + Regex capture groups are available. + type: string + type: array scheme: description: HTTP scheme to use for scraping. type: string @@ -181,6 +226,17 @@ spec: items: type: string type: array + podTargetLabels: + description: PodTargetLabels transfers labels on the Kubernetes Pod + onto the target. + items: + type: string + type: array + sampleLimit: + description: SampleLimit defines per-scrape limit on number of scraped + samples that will be accepted. + format: int64 + type: integer selector: description: A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. 
An empty diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index bad68f27..e0ac283a 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -61,5 +61,6 @@ rules: resources: - namespaces verbs: + - get - list - watch diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 6894d92c..9c9c485a 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -20,8 +20,8 @@ spec: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.23.2 - image: quay.io/coreos/prometheus-operator:v0.23.2 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.24.0 + image: quay.io/coreos/prometheus-operator:v0.24.0 name: prometheus-operator ports: - containerPort: 8080 From e1ed50158bdcd121ae167328660dd0d937f5d501 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 12 Oct 2018 14:30:33 +0200 Subject: [PATCH 435/638] contrib/kube-prometheus: Fix AlertmanagerConfigInconsistent alert expression --- jsonnet/kube-prometheus/alerts/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index c2e440c5..c8aba879 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -10,7 +10,7 @@ message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', }, expr: ||| - count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() 
label_replace(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 ||| % $._config, 'for': '5m', labels: { From 1655e572a6e381fbd0b4dcd048c747da0fbaf9fe Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 12 Oct 2018 14:35:58 +0200 Subject: [PATCH 436/638] contrib/kube-prometheus: Generate AlertmanagerConfigInconsistent --- jsonnetfile.lock.json | 2 +- manifests/prometheus-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index f4c615b8..b059a46c 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "13cb3c515152fb8495cdc6364938f19bff860e70" + "version": "949ffab68a175c0100cb5b9ac84a47d19752f868" }, { "name": "ksonnet", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 519997c6..3a427753 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -947,7 +947,7 @@ spec: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. 
expr: | - count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 for: 5m labels: severity: critical From 6b3dc1d41c47b5684a3315b89f98ea4abbc8d474 Mon Sep 17 00:00:00 2001 From: Joshua Olson Date: Fri, 12 Oct 2018 12:02:50 -0500 Subject: [PATCH 437/638] Add some commands to ensure kube-prometheus is fully successfully deployed. --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 36aea2cb..bd8b587d 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,12 @@ Though for a quickstart a compiled version of the Kubernetes [manifests](manifes * Simply create the stack: ``` $ kubectl create -f manifests/ || true -$ kubectl create -f manifests/ 2>/dev/null || true # This command sometimes may need to be done twice + +# It can take a few seconds for the above 'create manifests' command to fully create the following resources, so verify the resources are ready before proceeding. +until kubectl get customresourcedefinitions servicemonitors.monitoring.coreos.com ; do date; sleep 1; echo ""; done +until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done + +$ kubectl create -f manifests/ 2>/dev/null || true # This command sometimes may need to be done twice (to workaround a race condition). 
``` * And to teardown the stack: ``` From 466cbe50b0a6f407a2a5b12ddcca0015967eacb6 Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Mon, 1 Oct 2018 17:06:16 +0100 Subject: [PATCH 438/638] contrib/kube-prometheus: Add a script to sync images to an internal registry Crazy as it sounds, some Kubernetes installations don't have access to the Internet and source all their images from an internal registry. sync-to-internal-registry.jsonnet is a jsonnet snippet that helps with the task of pushing upstream images used by the prometheus operator to an internal registry by printing the right docker pull/tag/push commands. $ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization sync-to-internal-registry.jsonnet docker pull quay.io/coreos/addon-resizer:1.0 docker tag quay.io/coreos/addon-resizer:1.0 internal-registry.com/organization/addon-resizer:1.0 docker push internal-registry.com/organization/addon-resizer:1.0 docker pull quay.io/prometheus/alertmanager:v0.15.2 docker tag quay.io/prometheus/alertmanager:v0.15.2 internal-registry.com/organization/alertmanager:v0.15.2 docker push internal-registry.com/organization/alertmanager:v0.15.2 ... --- jsonnet/kube-prometheus/lib/image.libsonnet | 21 +++++++++++++++ jsonnet/kube-prometheus/lib/lib.libsonnet | 1 + sync-to-internal-registry.jsonnet | 30 +++++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 jsonnet/kube-prometheus/lib/image.libsonnet create mode 100644 jsonnet/kube-prometheus/lib/lib.libsonnet create mode 100644 sync-to-internal-registry.jsonnet diff --git a/jsonnet/kube-prometheus/lib/image.libsonnet b/jsonnet/kube-prometheus/lib/image.libsonnet new file mode 100644 index 00000000..0561e33c --- /dev/null +++ b/jsonnet/kube-prometheus/lib/image.libsonnet @@ -0,0 +1,21 @@ +// imageName extracts the image name from a fully qualified image string. eg. 
+// quay.io/coreos/addon-resizer -> addon-resizer +// grafana/grafana -> grafana +local imageName(image) = + local parts = std.split(image, '/'); + local len = std.length(parts); + if len == 3 then + # registry.com/org/image + parts[2] + else if len == 2 then + # org/image + parts[1] + else if len == 1 then + # image, ie. busybox + parts[0] + else + error 'unknown image format: ' + image; + +{ + imageName:: imageName, +} diff --git a/jsonnet/kube-prometheus/lib/lib.libsonnet b/jsonnet/kube-prometheus/lib/lib.libsonnet new file mode 100644 index 00000000..c30f976f --- /dev/null +++ b/jsonnet/kube-prometheus/lib/lib.libsonnet @@ -0,0 +1 @@ +(import 'image.libsonnet') diff --git a/sync-to-internal-registry.jsonnet b/sync-to-internal-registry.jsonnet new file mode 100644 index 00000000..f0cf35ae --- /dev/null +++ b/sync-to-internal-registry.jsonnet @@ -0,0 +1,30 @@ +local kp = import 'kube-prometheus/kube-prometheus.libsonnet'; +local l = import 'kube-prometheus/lib/lib.libsonnet'; +local config = kp._config; + +local makeImages(config) = [ + { + name: config.imageRepos[image], + tag: config.versions[image], + } + for image in std.objectFields(config.imageRepos) +]; + +local upstreamImage(image) = '%s:%s' % [image.name, image.tag]; +local downstreamImage(registry, image) = '%s/%s:%s' % [registry, l.imageName(image.name), image.tag]; + +local pullPush(image, newRegistry) = [ + 'docker pull %s' % upstreamImage(image), + 'docker tag %s %s' % [upstreamImage(image), downstreamImage(newRegistry, image)], + 'docker push %s' % downstreamImage(newRegistry, image), +]; + +local images = makeImages(config); + +local output(repository) = std.flattenArrays([ + pullPush(image, repository) + for image in images +]); + +function(repository="my-registry.com/repository") + std.join('\n', output(repository)) From 253abe0f965e06c5ec79e78cd67c732197c4db4a Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Mon, 1 Oct 2018 17:42:07 +0100 Subject: [PATCH 439/638] contrib/kube-prometheus: 
Introduce a withImageRepository mixin This mixin replaces all images prefixes by $repository to generate manifests that will point to an internal registry. --- .../kube-prometheus-config-mixins.libsonnet | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet diff --git a/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet new file mode 100644 index 00000000..ad278407 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet @@ -0,0 +1,20 @@ +local l = import 'lib/lib.libsonnet'; + +// withImageRepository is a mixin that replaces all images prefixes by repository. eg. +// quay.io/coreos/addon-resizer -> $repository/addon-resizer +// grafana/grafana -> grafana $repository/grafana +local withImageRepository(repository) = { + local oldRepos = super._config.imageRepos, + local substituteRepository(image, repository) = + if repository == null then image else repository + '/' + l.imageName(image), + _config+:: { + imageRepos:: { + [field]: substituteRepository(oldRepos[field], repository), + for field in std.objectFields(oldRepos) + } + }, +}; + +{ + withImageRepository:: withImageRepository, +} From 33e4ff2134f768e65f3c57c2fb1d124fdf583441 Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Mon, 1 Oct 2018 18:04:45 +0100 Subject: [PATCH 440/638] contrib/kube-prometheus: Document the internal registry support --- README.md | 40 ++++++++++++++++++++++++++++++ examples/internal-registry.jsonnet | 14 +++++++++++ 2 files changed, 54 insertions(+) create mode 100644 examples/internal-registry.jsonnet diff --git a/README.md b/README.md index bd8b587d..9d5cc8c1 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [Configuration](#configuration) * [Customization Examples](#customization-examples) * [Cluster 
Creation Tools](#cluster-creation-tools) + * [Internal Registries](#internal-registries) * [NodePorts](#nodeports) * [Prometheus Object Name](#prometheus-object-name) * [node-exporter DaemonSet namespace](#node-exporter-daemonset-namespace) @@ -325,6 +326,45 @@ kops: (import 'kube-prometheus/kube-prometheus-kops.libsonnet') ``` +### Internal Registry + +Some Kubernetes installations source all their images from an internal registry. kube-prometheus supports this use case and helps the user synchronize every image it uses to the internal registry and generate manifests pointing at the internal registry. + +To produce the `docker pull/tag/push` commands that will synchronize upstream images to `internal-registry.com/organization` (after having run the `jb` command to populate the vendor directory): + +```shell +$ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization sync-to-internal-registry.jsonnet +docker pull quay.io/coreos/addon-resizer:1.0 +docker tag quay.io/coreos/addon-resizer:1.0 internal-registry.com/organization/addon-resizer:1.0 +docker push internal-registry.com/organization/addon-resizer:1.0 +docker pull quay.io/prometheus/alertmanager:v0.15.2 +docker tag quay.io/prometheus/alertmanager:v0.15.2 internal-registry.com/organization/alertmanager:v0.15.2 +docker push internal-registry.com/organization/alertmanager:v0.15.2 +... +``` + +The output of this command can be piped to a shell to be executed by appending `| sh`. 
+ +Then to generate manifests with `internal-registry.com/organization`, use the `withImageRepository` mixin: + +[embedmd]:# (examples/internal-registry.jsonnet) +```jsonnet +local mixin = import 'kube-prometheus/kube-prometheus-config-mixins.libsonnet'; +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, +} + mixin.withImageRepository('internal-registry.com/organization'); + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + ### NodePorts Another mixin that may be useful for exploring the stack is to expose the UIs of Prometheus, Alertmanager and Grafana on NodePorts: diff --git a/examples/internal-registry.jsonnet b/examples/internal-registry.jsonnet new file mode 100644 index 00000000..f1d1e8ac --- /dev/null +++ b/examples/internal-registry.jsonnet @@ -0,0 +1,14 @@ +local mixin = import 'kube-prometheus/kube-prometheus-config-mixins.libsonnet'; +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, +} + mixin.withImageRepository('internal-registry.com/organization'); + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in 
std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } From c096a8fac8098732a92ddc7affc29befdc5b2897 Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Mon, 1 Oct 2018 22:41:50 +0100 Subject: [PATCH 441/638] kube-prometheus: Update kube-prometheus vendoring --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index b059a46c..b350b259 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "949ffab68a175c0100cb5b9ac84a47d19752f868" + "version": "b0fb3d478adaf8a9749137921642d4c794a66bbd" }, { "name": "ksonnet", From 0eabd2e536545f4bac3091fa4f85ffa8eb107f79 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 16 Oct 2018 14:33:14 +0200 Subject: [PATCH 442/638] kube-prometheus: Fix Alertmanager secret namespace. 
--- .../alertmanager/alertmanager.libsonnet | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 91cf05f1..a6d9e8e6 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -25,7 +25,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; repeat_interval: '12h', receiver: 'null', routes: [ - { + { receiver: 'null', match: { alertname: 'DeadMansSwitch', @@ -35,7 +35,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, receivers: [ { - name: 'null', + name: 'null', }, ], }, @@ -47,12 +47,12 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; secret: local secret = k.core.v1.secret; - if std.type($._config.alertmanager.config) == "object" then - secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64(std.manifestYamlDoc($._config.alertmanager.config)) }) + if std.type($._config.alertmanager.config) == 'object' then + secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64(std.manifestYamlDoc($._config.alertmanager.config)) }) + + secret.mixin.metadata.withNamespace($._config.namespace) else - secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) - + - secret.mixin.metadata.withNamespace($._config.namespace), + secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) + + secret.mixin.metadata.withNamespace($._config.namespace), serviceAccount: local serviceAccount = k.core.v1.serviceAccount; From 2cea4ee48204b3d90de3a1813b9712ceaf3f94dc Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 16 Oct 2018 14:53:31 +0200 Subject: [PATCH 443/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 6 
+++--- manifests/alertmanager-secret.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index b350b259..d1176893 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "b0fb3d478adaf8a9749137921642d4c794a66bbd" + "version": "50038889fade4d7a17352592cd268d9d1e709b93" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "ba606bf85edfb3007f27a97d41f54e3fe3f70ce6" + "version": "7a759c18d294698f537f8be91927354818a71e51" } ] -} +} \ No newline at end of file diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml index 4a143fbb..79fc7a21 100644 --- a/manifests/alertmanager-secret.yaml +++ b/manifests/alertmanager-secret.yaml @@ -1,6 +1,6 @@ apiVersion: v1 data: - alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== + alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJEZWFkTWFuc1N3aXRjaCIKICAgICJyZWNlaXZlciI6ICJudWxsIg== kind: Secret metadata: name: alertmanager-main From 6c462aefa6f40f4b33c7affdc9f2f424f29bb5dc Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Tue, 16 Oct 2018 13:58:51 +0100 Subject: [PATCH 444/638] contrib/kube-prometheus: Bump the prometheus-operator jsonnet dep kube-prometheus now contains a script, sync-to-internal-registry.jsonnet, that 
depends on commit 8c6a68760010347134e41f3aa3d73c68eb094a1b to work. Bump the prometheus-operator hash accordingly. --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index b350b259..2d290153 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "4a7fea51ab3f10329472c07028354617fb6635fe" + "version": "8c6a68760010347134e41f3aa3d73c68eb094a1b" }, { "name": "etcd-mixin", From 71f51bd12dcefa7565ee02583f26a80f2455001f Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Thu, 18 Oct 2018 16:34:04 +1300 Subject: [PATCH 445/638] kube-prometheus: Example jsonnet ingress, add externalURL --- examples/ingress.jsonnet | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/ingress.jsonnet b/examples/ingress.jsonnet index 149fea9f..48bc9ada 100644 --- a/examples/ingress.jsonnet +++ b/examples/ingress.jsonnet @@ -11,6 +11,13 @@ local kp = _config+:: { namespace: 'monitoring', }, + prometheus+:: { + prometheus+: { + spec+: { + externalURL: 'http://prometheus.example.com', + }, + }, + }, ingress+:: { 'prometheus-k8s': ingress.new() + From 012a60163ea9619321df8968542c14ab85bcdd3b Mon Sep 17 00:00:00 2001 From: kkc Date: Sat, 20 Oct 2018 00:15:02 +0800 Subject: [PATCH 446/638] contrib: fix typo in contrib prometheus rules Change `Promehteus` to `Prometheus` --- jsonnet/kube-prometheus/alerts/prometheus.libsonnet | 2 +- manifests/prometheus-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet index 99be08ff..b188faa2 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet @@ -8,7 +8,7 @@ alert: 'PrometheusConfigReloadFailed', annotations: { description: "Reloading Prometheus' configuration has failed for 
{{$labels.namespace}}/{{$labels.pod}}", - summary: "Reloading Promehteus' configuration failed", + summary: "Reloading Prometheus' configuration failed", }, expr: ||| prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 3a427753..0565661a 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1001,7 +1001,7 @@ spec: - alert: PrometheusConfigReloadFailed annotations: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} - summary: Reloading Promehteus' configuration failed + summary: Reloading Prometheus' configuration failed expr: | prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0 for: 10m From 19e94092d659546e22a0542155e46dddfb19dfa9 Mon Sep 17 00:00:00 2001 From: kkc Date: Sat, 20 Oct 2018 00:15:49 +0800 Subject: [PATCH 447/638] Regenerate jsonnetfile.lock.json --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 46843efe..390820de 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "50038889fade4d7a17352592cd268d9d1e709b93" + "version": "230f671aab6ece196aa7a0f618da38173b4daf71" }, { "name": "ksonnet", @@ -81,4 +81,4 @@ "version": "7a759c18d294698f537f8be91927354818a71e51" } ] -} \ No newline at end of file +} From 352c9171cf69e0c9b2ef9eba14e77e9b72ba5ec5 Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Sat, 20 Oct 2018 15:54:04 +1300 Subject: [PATCH 448/638] kube-prometheus: Example jsonnet ingress, add alertmanager and grafana --- examples/ingress.jsonnet | 64 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/examples/ingress.jsonnet b/examples/ingress.jsonnet index 48bc9ada..8d98a10c 100644 --- 
a/examples/ingress.jsonnet +++ b/examples/ingress.jsonnet @@ -11,6 +11,23 @@ local kp = _config+:: { namespace: 'monitoring', }, + // Configure External URL's per application + alertmanager+:: { + alertmanager+: { + spec+: { + externalURL: 'http://alertmanager.example.com', + }, + }, + }, + grafana+:: { + config: { + sections: { + server: { + root_url: 'http://grafana.example.com/', + }, + }, + }, + }, prometheus+:: { prometheus+: { spec+: { @@ -18,10 +35,47 @@ local kp = }, }, }, + // Create ingress objects per application ingress+:: { - 'prometheus-k8s': + alertmanager: ingress.new() + - ingress.mixin.metadata.withName('prometheus-k8s') + + ingress.mixin.metadata.withName('alertmanager-main') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'nginx.ingress.kubernetes.io/auth-type': 'basic', + 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', + 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', + }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost('alertmanager.example.com') + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.mixin.backend.withServiceName('alertmanager-main') + + httpIngressPath.mixin.backend.withServicePort('web') + ), + ), + grafana: + ingress.new() + + ingress.mixin.metadata.withName('grafana') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'nginx.ingress.kubernetes.io/auth-type': 'basic', + 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', + 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', + }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost('grafana.example.com') + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.mixin.backend.withServiceName('grafana') + + httpIngressPath.mixin.backend.withServicePort('http') + ), + ), + prometheus: + ingress.new() + + 
ingress.mixin.metadata.withName('prometheus') + ingress.mixin.metadata.withNamespace($._config.namespace) + ingress.mixin.metadata.withAnnotations({ 'nginx.ingress.kubernetes.io/auth-type': 'basic', @@ -39,6 +93,7 @@ local kp = ), }, } + { + // Create basic auth secret - replace 'auth' file with your own ingress+:: { 'basic-auth-secret': secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) + @@ -46,7 +101,4 @@ local kp = }, }; -k.core.v1.list.new([ - kp.ingress['prometheus-k8s'], - kp.ingress['basic-auth-secret'], -]) +{ [name + '-ingress']: kp.ingress[name] for name in std.objectFields(kp.ingress) } From f6a3ac030ac9877d3d6505fb762e14e874afe9c0 Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Sun, 21 Oct 2018 08:29:33 +1300 Subject: [PATCH 449/638] kube-prometheus: Example jsonnet ingress, merge Grafana config instead of overwrite --- examples/ingress.jsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ingress.jsonnet b/examples/ingress.jsonnet index 8d98a10c..cfc6dbbf 100644 --- a/examples/ingress.jsonnet +++ b/examples/ingress.jsonnet @@ -20,9 +20,9 @@ local kp = }, }, grafana+:: { - config: { - sections: { - server: { + config+: { + sections+: { + server+: { root_url: 'http://grafana.example.com/', }, }, From 8094f67a40e4ff555713c3cb01c08f3ffed2ff42 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 22 Oct 2018 18:38:44 +0200 Subject: [PATCH 450/638] contrib/kube-prometheus: thanos-peers service misses namespace --- jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index 4ef00612..28b76dff 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -57,8 +57,8 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; thanosPeerService: local 
thanosPeerPort = servicePort.newNamed('cluster', 10900, 'cluster'); service.new('thanos-peers', { 'thanos-peer': 'true' }, thanosPeerPort) + + service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.spec.withType('ClusterIP') + service.mixin.spec.withClusterIp('None'), - }, } From f63ebc87e0aedf093c9e63800d4252321126c866 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 24 Oct 2018 14:54:08 +0200 Subject: [PATCH 451/638] kube-prometheus: Bump prometheus-operator to v0.25.0 --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 96e1fe49..d42e422a 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.24.0" + "version": "v0.25.0" }, { "name": "etcd-mixin", From 830500cfc71a559a397775b408b502108671e445 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 24 Oct 2018 15:29:20 +0200 Subject: [PATCH 452/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 14 ++-- ...0alertmanagerCustomResourceDefinition.yaml | 61 +++++++++++--- ...r-0prometheusCustomResourceDefinition.yaml | 80 +++++++++++++++---- ...rometheusruleCustomResourceDefinition.yaml | 11 +-- .../0prometheus-operator-deployment.yaml | 4 +- manifests/prometheus-rules.yaml | 3 +- 6 files changed, 130 insertions(+), 43 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 390820de..a62c4f16 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "230f671aab6ece196aa7a0f618da38173b4daf71" + "version": "72cd517798083622b2d31943e3f4e38b6b1a204f" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "d24c4066aa2653370e1403812202eb38b2e70210" + "version": 
"9d393239bd361c6ff9883f6d8c8e9bf0b1f1dd13" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "64147daa1267a2571ef95609550b782ec9807c52" + "version": "bce2b3ae55983435f175045d59d0d5431570e120" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "bce24b0b087f7dc09c9e9f066f3e554a851792e9" + "version": "282ae11a6f4fa47bf844a68f8a3eee9dd26a14be" }, { "name": "grafana", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "850525cfa7a82115cf7a8a85f5ca632f4632be3d" + "version": "455e08134e1a135f41e1032576487921a759cf51" }, { "name": "prometheus-operator", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "8c6a68760010347134e41f3aa3d73c68eb094a1b" + "version": "82a6ad2071ff653e38b3b4719ecb789d73f3ab05" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "7a759c18d294698f537f8be91927354818a71e51" + "version": "965ba5ca8bbf015b68abe3ad45c99270dc1022fb" } ] } diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 7c5ff668..22248a54 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -593,6 +593,13 @@ spec: baseImage: description: Base image that is used to deploy pods, without tag. type: string + configMaps: + description: ConfigMaps is a list of ConfigMaps in the same namespace + as the Alertmanager object, which shall be mounted into the Alertmanager + Pods. The ConfigMaps are mounted into /etc/alertmanager/configmaps/. + items: + type: string + type: array containers: description: Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to an Alertmanager @@ -1057,8 +1064,8 @@ spec: to by services. 
type: string protocol: - description: Protocol for port. Must be UDP or TCP. Defaults - to "TCP". + description: Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". type: string required: - containerPort @@ -1211,6 +1218,13 @@ spec: privileged containers are essentially equivalent to root on the host. Defaults to false. type: boolean + procMount: + description: procMount denotes the type of proc mount to use + for the containers. The default is DefaultProcMount which + uses the container runtime defaults for readonly paths and + masked paths. This requires the ProcMountType feature flag + to be enabled. + type: string readOnlyRootFilesystem: description: Whether this container has a read-only root filesystem. Default is false. @@ -1333,8 +1347,8 @@ spec: mountPropagation: description: mountPropagation determines how mounts are propagated from the host to container and the other way - around. When not set, MountPropagationHostToContainer - is used. This field is beta in 1.10. + around. When not set, MountPropagationNone is used. This + field is beta in 1.10. type: string name: description: This must match the Name of a Volume. @@ -1569,11 +1583,12 @@ spec: the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of - available objects. Continuing a list may not be possible - if the server configuration has changed or more than - a few minutes have passed. The resourceVersion field - returned when using this continue value will be identical - to the value in the first response. + available objects. Continuing a consistent list may + not be possible if the server configuration has changed + or more than a few minutes have passed. The resourceVersion + field returned when using this continue value will + be identical to the value in the first response, unless + you have received this token from an error message. 
type: string resourceVersion: description: 'String that identifies the server''s internal @@ -2093,12 +2108,14 @@ spec: available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next - set of available objects. Continuing a list - may not be possible if the server configuration + set of available objects. Continuing a consistent + list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical - to the value in the first response. + to the value in the first response, unless + you have received this token from an error + message. type: string resourceVersion: description: 'String that identifies the server''s @@ -2216,6 +2233,26 @@ spec: items: type: string type: array + dataSource: + description: TypedLocalObjectReference contains enough information + to let you locate the typed referenced object inside the + same namespace. + properties: + apiGroup: + description: APIGroup is the group for the resource + being referenced. If APIGroup is not specified, the + specified Kind must be in the core API group. For + any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name resources: description: ResourceRequirements describes the compute resource requirements. 
diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 0117d343..58265136 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -41,6 +41,21 @@ spec: type: boolean required: - key + additionalAlertRelabelConfigs: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be a valid + secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be defined + type: boolean + required: + - key additionalScrapeConfigs: description: SecretKeySelector selects a key of a Secret. properties: @@ -744,6 +759,13 @@ spec: baseImage: description: Base image to use for a Prometheus deployment. type: string + configMaps: + description: ConfigMaps is a list of ConfigMaps in the same namespace + as the Prometheus object, which shall be mounted into the Prometheus + Pods. The ConfigMaps are mounted into /etc/prometheus/configmaps/. + items: + type: string + type: array containers: description: Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod. @@ -1207,8 +1229,8 @@ spec: to by services. type: string protocol: - description: Protocol for port. Must be UDP or TCP. Defaults - to "TCP". + description: Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". type: string required: - containerPort @@ -1361,6 +1383,13 @@ spec: privileged containers are essentially equivalent to root on the host. Defaults to false. type: boolean + procMount: + description: procMount denotes the type of proc mount to use + for the containers. 
The default is DefaultProcMount which + uses the container runtime defaults for readonly paths and + masked paths. This requires the ProcMountType feature flag + to be enabled. + type: string readOnlyRootFilesystem: description: Whether this container has a read-only root filesystem. Default is false. @@ -1483,8 +1512,8 @@ spec: mountPropagation: description: mountPropagation determines how mounts are propagated from the host to container and the other way - around. When not set, MountPropagationHostToContainer - is used. This field is beta in 1.10. + around. When not set, MountPropagationNone is used. This + field is beta in 1.10. type: string name: description: This must match the Name of a Volume. @@ -1725,11 +1754,12 @@ spec: the server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of - available objects. Continuing a list may not be possible - if the server configuration has changed or more than - a few minutes have passed. The resourceVersion field - returned when using this continue value will be identical - to the value in the first response. + available objects. Continuing a consistent list may + not be possible if the server configuration has changed + or more than a few minutes have passed. The resourceVersion + field returned when using this continue value will + be identical to the value in the first response, unless + you have received this token from an error message. type: string resourceVersion: description: 'String that identifies the server''s internal @@ -2199,10 +2229,6 @@ spec: description: Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods. The Secrets are mounted into /etc/prometheus/secrets/. - Secrets changes after initial creation of a Prometheus object are - not reflected in the running Pods. 
To change the secrets mounted into - the Prometheus Pods, the object must be deleted and recreated with - the new list of secrets. items: type: string type: array @@ -2668,12 +2694,14 @@ spec: available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next - set of available objects. Continuing a list - may not be possible if the server configuration + set of available objects. Continuing a consistent + list may not be possible if the server configuration has changed or more than a few minutes have passed. The resourceVersion field returned when using this continue value will be identical - to the value in the first response. + to the value in the first response, unless + you have received this token from an error + message. type: string resourceVersion: description: 'String that identifies the server''s @@ -2791,6 +2819,26 @@ spec: items: type: string type: array + dataSource: + description: TypedLocalObjectReference contains enough information + to let you locate the typed referenced object inside the + same namespace. + properties: + apiGroup: + description: APIGroup is the group for the resource + being referenced. If APIGroup is not specified, the + specified Kind must be in the core API group. For + any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name resources: description: ResourceRequirements describes the compute resource requirements. 
diff --git a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index 52587192..877fadac 100644 --- a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -196,11 +196,12 @@ spec: server has more data available. The value is opaque and may be used to issue another request to the endpoint that served this list to retrieve the next set of available - objects. Continuing a list may not be possible if the - server configuration has changed or more than a few minutes - have passed. The resourceVersion field returned when using - this continue value will be identical to the value in - the first response. + objects. Continuing a consistent list may not be possible + if the server configuration has changed or more than a + few minutes have passed. The resourceVersion field returned + when using this continue value will be identical to the + value in the first response, unless you have received + this token from an error message. 
type: string resourceVersion: description: 'String that identifies the server''s internal diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 9c9c485a..a82bf6f3 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -20,8 +20,8 @@ spec: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.24.0 - image: quay.io/coreos/prometheus-operator:v0.24.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.25.0 + image: quay.io/coreos/prometheus-operator:v0.25.0 name: prometheus-operator ports: - containerPort: 8080 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 0565661a..a049cf46 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -787,7 +787,8 @@ spec: - alert: CPUThrottlingHigh annotations: message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace - }} for {{ $labels.container_name }}.' + }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name + }}.' 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total[5m])) From 362e8249366133d01940bd556c81557a411f74ed Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 25 Oct 2018 17:03:25 +0200 Subject: [PATCH 453/638] kube-prometheus: Update kube-rbac-proxy and kube-state-metrics --- .../kube-state-metrics/kube-state-metrics.libsonnet | 4 ++-- jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 2152c65f..167e4632 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -16,8 +16,8 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, versions+:: { - kubeStateMetrics: 'v1.3.1', - kubeRbacProxy: 'v0.3.1', + kubeStateMetrics: 'v1.4.0', + kubeRbacProxy: 'v0.4.0', addonResizer: '1.0', }, diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 7d8aadd8..8ac3d73e 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -6,7 +6,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { nodeExporter: 'v0.16.0', - kubeRbacProxy: 'v0.3.1', + kubeRbacProxy: 'v0.4.0', }, imageRepos+:: { From 95dd9a95b85cca9a7060b26ae800f3acdbbdd7c1 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 26 Oct 2018 14:27:25 +0200 Subject: [PATCH 454/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 2 +- 
manifests/kube-state-metrics-deployment.yaml | 6 +++--- manifests/node-exporter-daemonset.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index a62c4f16..ca87eab1 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "72cd517798083622b2d31943e3f4e38b6b1a204f" + "version": "a2cea4ac87d32686a2d5ab189f4e694297cbc305" }, { "name": "ksonnet", diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index 065c87a9..483b7c6d 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -19,7 +19,7 @@ spec: - args: - --secure-listen-address=:8443 - --upstream=http://127.0.0.1:8081/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.1 + image: quay.io/coreos/kube-rbac-proxy:v0.4.0 name: kube-rbac-proxy-main ports: - containerPort: 8443 @@ -34,7 +34,7 @@ spec: - args: - --secure-listen-address=:9443 - --upstream=http://127.0.0.1:8082/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.1 + image: quay.io/coreos/kube-rbac-proxy:v0.4.0 name: kube-rbac-proxy-self ports: - containerPort: 9443 @@ -51,7 +51,7 @@ spec: - --port=8081 - --telemetry-host=127.0.0.1 - --telemetry-port=8082 - image: quay.io/coreos/kube-state-metrics:v1.3.1 + image: quay.io/coreos/kube-state-metrics:v1.4.0 name: kube-state-metrics resources: limits: diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index b3febf8c..ee084f0a 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -44,7 +44,7 @@ spec: - args: - --secure-listen-address=:9100 - --upstream=http://127.0.0.1:9101/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.1 + image: quay.io/coreos/kube-rbac-proxy:v0.4.0 name: kube-rbac-proxy ports: - containerPort: 9100 From 49835437f784c28a6c34a5f69bc2b79907d9ff80 Mon 
Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 26 Oct 2018 16:39:39 +0200 Subject: [PATCH 455/638] Add AlertmanagerMembersInconsistent alerting rule --- .../alerts/alertmanager.libsonnet | 15 +++++++ jsonnet/kube-prometheus/alerts/tests.yaml | 44 +++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 jsonnet/kube-prometheus/alerts/tests.yaml diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index c8aba879..42ae3f98 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -30,6 +30,21 @@ severity: 'warning', }, }, + { + alert:'AlertmanagerMembersInconsistent', + annotations:{ + message: 'Alertmanager has not found all other members of the cluster.', + }, + expr: ||| + alertmanager_cluster_members{%(alertmanagerSelector)s} + != on (service) + count by (service) (alertmanager_cluster_members{%(alertmanagerSelector)s}) + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, ], }, ], diff --git a/jsonnet/kube-prometheus/alerts/tests.yaml b/jsonnet/kube-prometheus/alerts/tests.yaml new file mode 100644 index 00000000..8cfc3aa7 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/tests.yaml @@ -0,0 +1,44 @@ +# TODO(metalmatze): This file is temporarily saved here for later reference +# until we find out how to integrate the tests into our jsonnet stack. 
+ +rule_files: + - rules.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' + values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0' + - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' + values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' + - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' + values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' + alert_rule_test: + - eval_time: 5m + alertname: AlertmanagerMembersInconsistent + - eval_time: 11m + alertname: AlertmanagerMembersInconsistent + exp_alerts: + - exp_labels: + service: 'alertmanager-main' + severity: critical + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - eval_time: 17m + alertname: AlertmanagerMembersInconsistent + exp_alerts: + - exp_labels: + service: 'alertmanager-main' + severity: critical + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - eval_time: 23m + alertname: AlertmanagerMembersInconsistent + exp_alerts: + - exp_labels: + service: 'alertmanager-main' + severity: critical + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' 
From 29e876d02380b6c357a74dfccd607256b8dbe6c9 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 26 Oct 2018 17:26:50 +0200 Subject: [PATCH 456/638] contrib/kube-prometheus: Add AlertmanagerMembersInconsistent to manifests --- jsonnetfile.lock.json | 2 +- manifests/prometheus-rules.yaml | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index ca87eab1..76ad6b09 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "a2cea4ac87d32686a2d5ab189f4e694297cbc305" + "version": "04235fdb35f150a46d5aeefd72c995bf864d2a2f" }, { "name": "ksonnet", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index a049cf46..673dce1f 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -961,6 +961,16 @@ spec: for: 10m labels: severity: warning + - alert: AlertmanagerMembersInconsistent + annotations: + message: Alertmanager has not found all other members of the cluster. 
+ expr: | + alertmanager_cluster_members{job="alertmanager-main"} + != on (service) + count by (service) (alertmanager_cluster_members{job="alertmanager-main"}) + for: 5m + labels: + severity: critical - name: general.rules rules: - alert: TargetDown From cfc4a9867bea974823b8bc6a0585afce6153b754 Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Mon, 29 Oct 2018 09:26:20 +1300 Subject: [PATCH 457/638] Example jsonnet ingress, update docs with external url --- ...sing-prometheus-alertmanager-grafana-ingress.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/exposing-prometheus-alertmanager-grafana-ingress.md b/docs/exposing-prometheus-alertmanager-grafana-ingress.md index 34213067..68c5795a 100644 --- a/docs/exposing-prometheus-alertmanager-grafana-ingress.md +++ b/docs/exposing-prometheus-alertmanager-grafana-ingress.md @@ -24,7 +24,8 @@ htpasswd -c auth In order to use this a secret needs to be created containing the name of the `htpasswd`, and with annotations on the Ingress object basic auth can be configured. -[embedmd]:# (../examples/ingress.jsonnet) +Also, the applications provide external links to themselves in alerts and various places. When an ingress is used in front of the applications these links need to be based on the external URL's. This can be configured for each application in jsonnet. + ```jsonnet local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local secret = k.core.v1.secret; @@ -39,6 +40,13 @@ local kp = _config+:: { namespace: 'monitoring', }, + prometheus+:: { + prometheus+: { + spec+: { + externalURL: 'http://prometheus.example.com', + }, + }, + }, ingress+:: { 'prometheus-k8s': ingress.new() + @@ -73,7 +81,7 @@ k.core.v1.list.new([ ]) ``` -In order to expose Alertmanager and Grafana, simply create additional fields containing an ingress object, but simply pointing at the `alertmanager` or `grafana` instead of the `prometheus-k8s` Service. 
Make sure to also use the correct port respectively, for Alertmanager it is also `web`, for Grafana it is `http`. +In order to expose Alertmanager and Grafana, simply create additional fields containing an ingress object, but simply pointing at the `alertmanager` or `grafana` instead of the `prometheus-k8s` Service. Make sure to also use the correct port respectively, for Alertmanager it is also `web`, for Grafana it is `http`. Be sure to also specify the appropriate external URL. In order to render the ingress objects similar to the other objects use as demonstrated in the [main readme](../README.md#usage): @@ -89,3 +97,5 @@ In order to render the ingress objects similar to the other objects use as demon ``` Note, that in comparison only the last line was added, the rest is identical to the original. + +See (../examples/ingress.jsonnet) for an example implementation. From 5eec3fc643d2a52bc5ff4b7caee47163a7824f30 Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Mon, 29 Oct 2018 22:00:35 +1300 Subject: [PATCH 458/638] Correct ingress jsonnet object naming to match projects general naming convention --- examples/ingress.jsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ingress.jsonnet b/examples/ingress.jsonnet index cfc6dbbf..7b89094f 100644 --- a/examples/ingress.jsonnet +++ b/examples/ingress.jsonnet @@ -37,7 +37,7 @@ local kp = }, // Create ingress objects per application ingress+:: { - alertmanager: + 'alertmanager-main': ingress.new() + ingress.mixin.metadata.withName('alertmanager-main') + ingress.mixin.metadata.withNamespace($._config.namespace) + @@ -73,9 +73,9 @@ local kp = httpIngressPath.mixin.backend.withServicePort('http') ), ), - prometheus: + 'prometheus-k8s': ingress.new() + - ingress.mixin.metadata.withName('prometheus') + + ingress.mixin.metadata.withName('prometheus-k8s') + ingress.mixin.metadata.withNamespace($._config.namespace) + ingress.mixin.metadata.withAnnotations({ 
'nginx.ingress.kubernetes.io/auth-type': 'basic', From 97391a24c1e97a0332eaa1f1e4dba87006e6705c Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Tue, 30 Oct 2018 06:50:00 +1300 Subject: [PATCH 459/638] kube-prometheus: docs - fix example link --- docs/exposing-prometheus-alertmanager-grafana-ingress.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/exposing-prometheus-alertmanager-grafana-ingress.md b/docs/exposing-prometheus-alertmanager-grafana-ingress.md index 68c5795a..9874dc20 100644 --- a/docs/exposing-prometheus-alertmanager-grafana-ingress.md +++ b/docs/exposing-prometheus-alertmanager-grafana-ingress.md @@ -98,4 +98,4 @@ In order to render the ingress objects similar to the other objects use as demon Note, that in comparison only the last line was added, the rest is identical to the original. -See (../examples/ingress.jsonnet) for an example implementation. +See [ingress.jsonnet](../examples/ingress.jsonnet) for an example implementation. From e53d3689b463b80663eb16885b990ff619bb7ac6 Mon Sep 17 00:00:00 2001 From: superbspeed Date: Thu, 1 Nov 2018 10:57:54 -0500 Subject: [PATCH 460/638] Correct file path for getting started guide. --- docs/kube-prometheus-on-kubeadm.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/kube-prometheus-on-kubeadm.md b/docs/kube-prometheus-on-kubeadm.md index bef2e3cc..d0101fea 100644 --- a/docs/kube-prometheus-on-kubeadm.md +++ b/docs/kube-prometheus-on-kubeadm.md @@ -7,7 +7,7 @@ The [kubeadm](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/) tool is linked by Kubernetes as the offical way to deploy and manage self-hosted clusters. Kubeadm does a lot of heavy lifting by automatically configuring your Kubernetes cluster with some common options. This guide is intended to show you how to deploy Prometheus, Prometheus Operator and Kube Prometheus to get you started monitoring your cluster that was deployed with Kubeadm. 
-This guide assumes you have a basic understanding of how to use the functionality the Prometheus Operator implements. If you haven't yet, we recommend reading through the [getting started guide](getting-started.md) as well as the [alerting guide](../../../Documentation/user-guides/alerting.md). +This guide assumes you have a basic understanding of how to use the functionality the Prometheus Operator implements. If you haven't yet, we recommend reading through the [getting started guide](../../../Documentation/user-guides/getting-started.md) as well as the [alerting guide](../../../Documentation/user-guides/alerting.md). ## Kubeadm Pre-requisites From 305c28681bbeeca6cc1f5df0e9771501cc0bc90a Mon Sep 17 00:00:00 2001 From: Vincent Brouillet Date: Fri, 2 Nov 2018 13:52:38 +1100 Subject: [PATCH 461/638] adding an how to update section in kube-prometheus doc --- README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/README.md b/README.md index 9d5cc8c1..fdc799f5 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,14 @@ jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} This script runs the jsonnet code, then reads each key of the generated json and uses that as the file name, and writes the value of that key to that file, and converts each json manifest to yaml. +### Apply the manifests to setup Grafana and Prometheus +The previous steps (compilation) has created a bunch of manifest files in the manifest/ folder. +Now simply use kubectl to install Prometheus and Grafana as per your configuration: + +`kubectl apply -f manifests/` + +Check the monitoring namespace (or the namespace you have specific in `namespace: `) and make sure the pods are running. Prometheus and Grafana should be up and running soon. 
+ ### Containerized Installing and Compiling If you don't care to have `jb` nor `jsonnet` nor `gojsontoyaml` installed, then build the `po-jsonnet` Docker image (this is something you'll need a copy of this repository for). Do the following from this `kube-prometheus` directory: @@ -212,6 +220,26 @@ docker run \ po-jsonnet ./build.sh example.jsonnet ``` +## Update from upstream project +You may wish to fetch changes made on this project so they are available to you. + +### Update jb +jb may have been updated so it's a good idea to get the latest version of this binary + +``` +go get -u github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb +``` + +### Update from Prometheus Operator +The command below will sync with upstream project. +``` +jb update +``` + +### Compile the manifests and apply +Once updated, just follow the instructions under "Compiling" and "Apply the manifests to setup Grafana and Prometheus" to apply the changes to your cluster. + + ## Configuration Jsonnet has the concept of hidden fields. These are fields, that are not going to be rendered in a result. This is used to configure the kube-prometheus components in jsonnet. In the example jsonnet code of the above [Usage section](#Usage), you can see an example of this, where the `namespace` is being configured to be `monitoring`. In order to not override the whole object, use the `+::` construct of jsonnet, to merge objects, this way you can override individual settings, but retain all other settings and defaults. 
From cad5745106278ff01afd95ec8afbddfadd205273 Mon Sep 17 00:00:00 2001 From: Vincent Brouillet Date: Mon, 5 Nov 2018 09:52:11 +1100 Subject: [PATCH 462/638] update doc --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fdc799f5..f8401338 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} This script runs the jsonnet code, then reads each key of the generated json and uses that as the file name, and writes the value of that key to that file, and converts each json manifest to yaml. -### Apply the manifests to setup Grafana and Prometheus +### Apply the kube-prometheus stack The previous steps (compilation) has created a bunch of manifest files in the manifest/ folder. Now simply use kubectl to install Prometheus and Grafana as per your configuration: @@ -230,14 +230,14 @@ jb may have been updated so it's a good idea to get the latest version of this b go get -u github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb ``` -### Update from Prometheus Operator +### Update kube-prometheus The command below will sync with upstream project. ``` jb update ``` ### Compile the manifests and apply -Once updated, just follow the instructions under "Compiling" and "Apply the manifests to setup Grafana and Prometheus" to apply the changes to your cluster. +Once updated, just follow the instructions under "Compiling" and "Apply the kube-prometheus stack" to apply the changes to your cluster. 
## Configuration From e42bff1ed3db7a7022f2eabf148ba19890ba4367 Mon Sep 17 00:00:00 2001 From: Alexandre Veyrenc Date: Mon, 5 Nov 2018 13:05:46 +0100 Subject: [PATCH 463/638] Add support for Kubespray clusters --- README.md | 10 +++++++++- examples/jsonnet-snippets/kubespray.jsonnet | 2 ++ .../kube-prometheus-kubespray.libsonnet | 18 ++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 examples/jsonnet-snippets/kubespray.jsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet diff --git a/README.md b/README.md index f8401338..69d402f3 100644 --- a/README.md +++ b/README.md @@ -328,7 +328,7 @@ Jsonnet is a turing complete language, any logic can be reflected in it. It also ### Cluster Creation Tools -A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For [kubeadm](examples/jsonnet-snippets/kubeadm.jsonnet) and [bootkube](examples/jsonnet-snippets/bootkube.jsonnet) and [kops](examples/jsonnet-snippets/kops.jsonnet) clusters there are mixins available to easily configure these: +A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. 
For [kubeadm](examples/jsonnet-snippets/kubeadm.jsonnet), [bootkube](examples/jsonnet-snippets/bootkube.jsonnet), [kops](examples/jsonnet-snippets/kops.jsonnet) and [kubespray](examples/jsonnet-snippets/kubespray.jsonnet) clusters there are mixins available to easily configure these: kubeadm: @@ -354,6 +354,14 @@ kops: (import 'kube-prometheus/kube-prometheus-kops.libsonnet') ``` +kubespray: + +[embedmd]:# (examples/jsonnet-snippets/kubespray.jsonnet) +```jsonnet +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kubespray.libsonnet') +``` + ### Internal Registry Some Kubernetes installations source all their images from an internal registry. kube-prometheus supports this use case and helps the user synchronize every image it uses to the internal registry and generate manifests pointing at the internal registry. diff --git a/examples/jsonnet-snippets/kubespray.jsonnet b/examples/jsonnet-snippets/kubespray.jsonnet new file mode 100644 index 00000000..1665cf72 --- /dev/null +++ b/examples/jsonnet-snippets/kubespray.jsonnet @@ -0,0 +1,2 @@ +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kubespray.libsonnet') diff --git a/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet new file mode 100644 index 00000000..8a69d215 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet @@ -0,0 +1,18 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+: { + kubeControllerManagerPrometheusDiscoveryService: + service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 
'kube-controller-manager' }) + + service.mixin.spec.withClusterIp('None'), + kubeSchedulerPrometheusDiscoveryService: + service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + + service.mixin.spec.withClusterIp('None'), + }, +} From 3e7084758104e61c1cb6ce81f640336bcc4bd1fa Mon Sep 17 00:00:00 2001 From: Alexandre Veyrenc Date: Mon, 5 Nov 2018 15:25:11 +0100 Subject: [PATCH 464/638] Upgrade packages for Kubespray support --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 76ad6b09..697e3606 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "04235fdb35f150a46d5aeefd72c995bf864d2a2f" + "version": "a9cb94d4f165f63e7b4094de1d087c5264667598" }, { "name": "ksonnet", From 681a18ef3aafb75ffb84946985b2e75294172bdc Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Tue, 6 Nov 2018 16:41:01 +0300 Subject: [PATCH 465/638] fix labels --- manifests/prometheus-rules.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 673dce1f..4770018c 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -688,7 +688,7 @@ spec: severity: warning - alert: KubeCronJobRunning annotations: - message: CronJob {{ $labels.namespaces }}/{{ $labels.cronjob }} is taking + message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning expr: | @@ -698,7 +698,7 @@ spec: severity: warning - alert: KubeJobCompletion annotations: - message: Job {{ $labels.namespaces }}/{{ $labels.job }} is taking more than + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion expr: | @@ -708,7 +708,7 @@ spec: severity: warning - alert: KubeJobFailed annotations: - message: Job {{ $labels.namespaces }}/{{ $labels.job }} failed to complete. + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed expr: | kube_job_status_failed{job="kube-state-metrics"} > 0 @@ -852,7 +852,7 @@ spec: severity: warning - alert: KubeClientErrors annotations: - message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + message: Kubernetes API server client '{{ $labels.job_name }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | @@ -865,7 +865,7 @@ spec: severity: warning - alert: KubeClientErrors annotations: - message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + message: Kubernetes API server client '{{ $labels.job_name }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | @@ -975,7 +975,7 @@ spec: rules: - alert: TargetDown annotations: - message: '{{ $value }}% of the {{ $labels.job }} targets are down.' 
+ message: '{{ $value }}% of the {{ $labels.job_name }} targets are down.' expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 for: 10m labels: @@ -1060,7 +1060,7 @@ spec: severity: warning - alert: PrometheusTSDBReloadsFailing annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + description: '{{$labels.job_name}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk expr: | @@ -1070,7 +1070,7 @@ spec: severity: warning - alert: PrometheusTSDBCompactionsFailing annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + description: '{{$labels.job_name}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.' summary: Prometheus has issues compacting sample blocks expr: | @@ -1080,7 +1080,7 @@ spec: severity: warning - alert: PrometheusTSDBWALCorruptions annotations: - description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead + description: '{{$labels.job_name}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).' 
summary: Prometheus write-ahead log is corrupted expr: | From 3c6059c6a3b0c1e3b8f545b327885d9bd2e0ab17 Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Wed, 7 Nov 2018 12:31:00 +0300 Subject: [PATCH 466/638] revert unnecessary replacements of job_name --- manifests/prometheus-rules.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 4770018c..11a23307 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -852,7 +852,7 @@ spec: severity: warning - alert: KubeClientErrors annotations: - message: Kubernetes API server client '{{ $labels.job_name }}/{{ $labels.instance + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | @@ -865,7 +865,7 @@ spec: severity: warning - alert: KubeClientErrors annotations: - message: Kubernetes API server client '{{ $labels.job_name }}/{{ $labels.instance + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | @@ -975,7 +975,7 @@ spec: rules: - alert: TargetDown annotations: - message: '{{ $value }}% of the {{ $labels.job_name }} targets are down.' + message: '{{ $value }}% of the {{ $labels.job }} targets are down.' 
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 for: 10m labels: @@ -1060,7 +1060,7 @@ spec: severity: warning - alert: PrometheusTSDBReloadsFailing annotations: - description: '{{$labels.job_name}} at {{$labels.instance}} had {{$value | humanize}} + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk expr: | @@ -1070,7 +1070,7 @@ spec: severity: warning - alert: PrometheusTSDBCompactionsFailing annotations: - description: '{{$labels.job_name}} at {{$labels.instance}} had {{$value | humanize}} + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.' summary: Prometheus has issues compacting sample blocks expr: | @@ -1080,7 +1080,7 @@ spec: severity: warning - alert: PrometheusTSDBWALCorruptions annotations: - description: '{{$labels.job_name}} at {{$labels.instance}} has a corrupted write-ahead + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).' 
summary: Prometheus write-ahead log is corrupted expr: | From 0372a60d0c5b53449beb3baf62bfcec7d1a70682 Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Wed, 7 Nov 2018 12:40:50 +0300 Subject: [PATCH 467/638] jsonnet update --- jsonnetfile.lock.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 697e3606..74733929 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "a9cb94d4f165f63e7b4094de1d087c5264667598" + "version": "6e412f0b4727f3a7a6f097530294409baa6b520a" }, { "name": "ksonnet", @@ -18,7 +18,7 @@ "subdir": "" } }, - "version": "ed0796f3cb97ebc35ae54f543b1814a7c8dae305" + "version": "d03da231d6c8bd74437b74a1e9e8b966f13dffa2" }, { "name": "kubernetes-mixin", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "9d393239bd361c6ff9883f6d8c8e9bf0b1f1dd13" + "version": "1595151b85934d55ea6969a781039d66f82b22d5" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "bce2b3ae55983435f175045d59d0d5431570e120" + "version": "1ed195577cd8a406d4811dd6818e939169b686a7" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "282ae11a6f4fa47bf844a68f8a3eee9dd26a14be" + "version": "04e51ce1caeaa4c9aed4c446c9922388a13f6cb1" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "965ba5ca8bbf015b68abe3ad45c99270dc1022fb" + "version": "83304cfc808cf6303d48c45a696f169fae422e68" } ] } From 5a0835fa2667de454a696d36be24c07d4a4cf0eb Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Wed, 7 Nov 2018 12:59:40 +0300 Subject: [PATCH 468/638] bump rules --- manifests/grafana-dashboardDefinitions.yaml | 44 ++++++------- manifests/prometheus-rules.yaml | 72 ++++++++++----------- 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml 
b/manifests/grafana-dashboardDefinitions.yaml index 1f9a7a88..31f2ffcf 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -1920,7 +1920,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - avg(rate(node_cpu{mode=\"idle\"}[1m]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2172,7 +2172,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - sum(:node_memory_MemFreeCachedBuffers:sum) / sum(:node_memory_MemTotal:sum)", + "expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum) / sum(:node_memory_MemTotal_bytes:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2256,7 +2256,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal:sum)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", "format": "time_series", "instant": true, 
"intervalFactor": 2, @@ -2340,7 +2340,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal:sum)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5003,7 +5003,7 @@ items: }, "yaxes": [ { - "format": "percentunit", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -5011,7 +5011,7 @@ items: "show": true }, { - "format": "percentunit", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -5064,7 +5064,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m])) * 100", + "expr": "avg by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m])) * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cpu}}", @@ -5076,7 +5076,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "System load", + "title": "Usage Per Core", "tooltip": { "shared": true, "sort": 0, @@ -5168,7 +5168,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "avg (sum by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", + "expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", "format": "time_series", "intervalFactor": 10, "legendFormat": "{{ cpu }}", @@ -5276,7 +5276,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "avg(sum by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", + "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -5352,28 +5352,28 @@ 
items: "steppedLine": false, "targets": [ { - "expr": "max(\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "expr": "max(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory used", "refId": "A" }, { - "expr": "max(node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory buffers", "refId": "B" }, { - "expr": "max(node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory cached", "refId": "C" }, { - "expr": "max(node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory free", @@ -5481,7 +5481,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(\n (\n (\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal{job=\"node-exporter\", 
instance=\"$instance\"}\n ) * 100)\n", + "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -5564,21 +5564,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_disk_bytes_read{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "read", "refId": "A" }, { - "expr": "max(rate(node_disk_bytes_written{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "written", "refId": "B" }, { - "expr": "max(rate(node_disk_io_time_ms{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "io time", @@ -5773,7 +5773,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5864,7 +5864,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes{job=\"node-exporter\", 
instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5958,7 +5958,7 @@ items: "options": [ ], - "query": "label_values(node_boot_time{job=\"node-exporter\"}, instance)", + "query": "label_values(node_boot_time_seconds{job=\"node-exporter\"}, instance)", "refresh": 2, "regex": "", "sort": 0, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 11a23307..f0e668a1 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -122,17 +122,17 @@ spec: record: 'node_namespace_pod:kube_pod_info:' - expr: | count by (node) (sum by (node, cpu) ( - node_cpu{job="node-exporter"} + node_cpu_seconds_total{job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: )) record: node:node_num_cpu:sum - expr: | - 1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m])) + 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) record: :node_cpu_utilisation:avg1m - expr: | 1 - avg by (node) ( - rate(node_cpu{job="node-exporter",mode="idle"}[1m]) + rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) record: node:node_cpu_utilisation:avg1m @@ -152,26 +152,26 @@ spec: record: 'node:node_cpu_saturation_load1:' - expr: | 1 - - sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) / - sum(node_memory_MemTotal{job="node-exporter"}) + sum(node_memory_MemTotal_bytes{job="node-exporter"}) record: ':node_memory_utilisation:' - expr: | - 
sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) - record: :node_memory_MemFreeCachedBuffers:sum + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + record: :node_memory_MemFreeCachedBuffers_bytes:sum - expr: | - sum(node_memory_MemTotal{job="node-exporter"}) - record: :node_memory_MemTotal:sum + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + record: :node_memory_MemTotal_bytes:sum - expr: | sum by (node) ( - (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_memory_bytes_available:sum - expr: | sum by (node) ( - node_memory_MemTotal{job="node-exporter"} + node_memory_MemTotal_bytes{job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -190,13 +190,13 @@ spec: - expr: | 1 - sum by (node) ( - (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) / sum by (node) ( - node_memory_MemTotal{job="node-exporter"} + node_memory_MemTotal_bytes{job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -213,21 +213,21 @@ spec: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | - avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) + 
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])) record: :node_disk_utilisation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 + irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -241,25 +241,25 @@ spec: max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_avail:' - expr: | - sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) record: :node_net_utilisation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) 
* on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_net_utilisation:sum_irate - expr: | - sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) record: :node_net_saturation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -688,8 +688,8 @@ spec: severity: warning - alert: KubeCronJobRunning annotations: - message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking - more than 1h to complete. + message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more + than 1h to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning expr: | time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 @@ -698,8 +698,8 @@ spec: severity: warning - alert: KubeJobCompletion annotations: - message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than - one hour to complete. + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more + than one hour to complete. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion expr: | kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 @@ -739,7 +739,7 @@ spec: expr: | sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) / - sum(node_memory_MemTotal) + sum(node_memory_MemTotal_bytes) > (count(node:node_num_cpu:sum)-1) / @@ -766,7 +766,7 @@ spec: expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) / - sum(node_memory_MemTotal{job="node-exporter"}) + sum(node_memory_MemTotal_bytes{job="node-exporter"}) > 1.5 for: 5m labels: @@ -801,7 +801,7 @@ spec: - alert: KubePersistentVolumeUsageCritical annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value + }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical expr: | @@ -816,14 +816,14 @@ spec: annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four - days. Currently {{ $value }} bytes are available. + days. Currently {{ printf "%0.2f" $value }}% is available. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays expr: | - ( - kubelet_volume_stats_used_bytes{job="kubelet"} + 100 * ( + kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} - ) > 0.85 + ) < 15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m From 174e84f3e56735db07e93d091f5e62c84dc16145 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 7 Nov 2018 15:32:39 +0100 Subject: [PATCH 469/638] *: Update to Prometheus v2.5.0 as default --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index d6cbc5fe..fafcd4d6 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheus: 'v2.4.3', + prometheus: 'v2.5.0', }, imageRepos+:: { From 2ffb7cffeab7a25176992b46bc982da17e772cec Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 7 Nov 2018 16:20:23 +0100 Subject: [PATCH 470/638] *: Generate latest Documentation and manifest for Prom v2.5.0 --- examples/minikube.jsonnet | 12 +++++------ examples/prometheus-pvc.jsonnet | 31 ++++++++++++++-------------- jsonnetfile.lock.json | 2 +- manifests/prometheus-prometheus.yaml | 2 +- sync-to-internal-registry.jsonnet | 20 +++++++++--------- 5 files changed, 33 insertions(+), 34 deletions(-) diff --git a/examples/minikube.jsonnet b/examples/minikube.jsonnet index 850514fd..3073612a 100644 --- a/examples/minikube.jsonnet +++ b/examples/minikube.jsonnet @@ -10,10 +10,10 @@ local kp = config: importstr 'alertmanager-config.yaml', }, grafana+:: { - config: { // 
http://docs.grafana.org/installation/configuration/ + config: { // http://docs.grafana.org/installation/configuration/ sections: { // Do not require grafana users to login/authenticate - "auth.anonymous": {enabled: true}, + 'auth.anonymous': { enabled: true }, }, }, }, @@ -27,13 +27,13 @@ local kp = // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec spec+: { // An e.g. of the purpose of this is so the "Source" links on http:///#/alerts are valid. - externalUrl: "http://192.168.99.100:30900", + externalUrl: 'http://192.168.99.100:30900', // Reference info: "external_labels" on https://prometheus.io/docs/prometheus/latest/configuration/configuration/ externalLabels: { // This 'cluster' label will be included on every firing prometheus alert. (This is more useful // when running multiple clusters in a shared environment (e.g. AWS) with other users.) - cluster: "minikube-", + cluster: 'minikube-', }, }, }, @@ -42,9 +42,9 @@ local kp = alertmanager+: { // Reference info: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#alertmanagerspec spec+: { - externalUrl: "http://192.168.99.100:30903", + externalUrl: 'http://192.168.99.100:30903', - logLevel: "debug", // So firing alerts show up in log + logLevel: 'debug', // So firing alerts show up in log }, }, }, diff --git a/examples/prometheus-pvc.jsonnet b/examples/prometheus-pvc.jsonnet index 75b250fe..82716e0f 100644 --- a/examples/prometheus-pvc.jsonnet +++ b/examples/prometheus-pvc.jsonnet @@ -1,10 +1,9 @@ - // Reference info: documentation for https://github.com/ksonnet/ksonnet-lib can be found at http://g.bryan.dev.hepti.center // -local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; // https://github.com/ksonnet/ksonnet-lib/blob/master/ksonnet.beta.3/k.libsonnet - imports k8s.libsonnet +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; // https://github.com/ksonnet/ksonnet-lib/blob/master/ksonnet.beta.3/k.libsonnet - imports k8s.libsonnet 
// * https://github.com/ksonnet/ksonnet-lib/blob/master/ksonnet.beta.3/k8s.libsonnet defines things such as "persistentVolumeClaim:: {" // -local pvc = k.core.v1.persistentVolumeClaim; // https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#persistentvolumeclaim-v1-core (defines variable named 'spec' of type 'PersistentVolumeClaimSpec') +local pvc = k.core.v1.persistentVolumeClaim; // https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#persistentvolumeclaim-v1-core (defines variable named 'spec' of type 'PersistentVolumeClaimSpec') local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + @@ -16,20 +15,20 @@ local kp = prometheus+:: { prometheus+: { - spec+: { // https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec + spec+: { // https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec // If a value isn't specified for 'retention', then by default the '--storage.tsdb.retention=24h' arg will be passed to prometheus by prometheus-operator. // The possible values for a prometheus are: // * https://github.com/prometheus/common/blob/c7de230/model/time.go#L178 specifies "^([0-9]+)(y|w|d|h|m|s|ms)$" (years weeks days hours minutes seconds milliseconds) - retention: "30d", + retention: '30d', // Reference info: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md // By default (if the following 'storage.volumeClaimTemplate' isn't created), prometheus will be created with an EmptyDir for the 'prometheus-k8s-db' volume (for the prom tsdb). 
// This 'storage.volumeClaimTemplate' causes the following to be automatically created (via dynamic provisioning) for each prometheus pod: // * PersistentVolumeClaim (and a corresponding PersistentVolume) // * the actual volume (per the StorageClassName specified below) - storage: { // https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#storagespec - volumeClaimTemplate: // (same link as above where the 'pvc' variable is defined) - pvc.new() + // http://g.bryan.dev.hepti.center/core/v1/persistentVolumeClaim/#core.v1.persistentVolumeClaim.new + storage: { // https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#storagespec + volumeClaimTemplate: // (same link as above where the 'pvc' variable is defined) + pvc.new() + // http://g.bryan.dev.hepti.center/core/v1/persistentVolumeClaim/#core.v1.persistentVolumeClaim.new pvc.mixin.spec.withAccessModes('ReadWriteOnce') + @@ -40,14 +39,14 @@ local kp = // A StorageClass of the following name (which can be seen via `kubectl get storageclass` from a node in the given K8s cluster) must exist prior to kube-prometheus being deployed. pvc.mixin.spec.withStorageClassName('ssd'), - // The following 'selector' is only needed if you're using manual storage provisioning (https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md#manual-storage-provisioning). 
- // And note that this is not supported/allowed by AWS - uncommenting the following 'selector' line (when deploying kube-prometheus to a K8s cluster in AWS) will cause the pvc to be stuck in the Pending status and have the following error: - // * 'Failed to provision volume with StorageClass "ssd": claim.Spec.Selector is not supported for dynamic provisioning on AWS' - //pvc.mixin.spec.selector.withMatchLabels({}), - }, // storage - }, // spec - }, // prometheus - }, // prometheus + // The following 'selector' is only needed if you're using manual storage provisioning (https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md#manual-storage-provisioning). + // And note that this is not supported/allowed by AWS - uncommenting the following 'selector' line (when deploying kube-prometheus to a K8s cluster in AWS) will cause the pvc to be stuck in the Pending status and have the following error: + // * 'Failed to provision volume with StorageClass "ssd": claim.Spec.Selector is not supported for dynamic provisioning on AWS' + //pvc.mixin.spec.selector.withMatchLabels({}), + }, // storage + }, // spec + }, // prometheus + }, // prometheus }; diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 74733929..195e58e0 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "6e412f0b4727f3a7a6f097530294409baa6b520a" + "version": "fa0a0ae33a16a23845da8ab9973dd4eed50a20df" }, { "name": "ksonnet", diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index ae18cd67..94fd64dc 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -25,4 +25,4 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - version: v2.4.3 + version: v2.5.0 diff --git a/sync-to-internal-registry.jsonnet 
b/sync-to-internal-registry.jsonnet index f0cf35ae..b7c85571 100644 --- a/sync-to-internal-registry.jsonnet +++ b/sync-to-internal-registry.jsonnet @@ -3,20 +3,20 @@ local l = import 'kube-prometheus/lib/lib.libsonnet'; local config = kp._config; local makeImages(config) = [ - { - name: config.imageRepos[image], - tag: config.versions[image], - } - for image in std.objectFields(config.imageRepos) + { + name: config.imageRepos[image], + tag: config.versions[image], + } + for image in std.objectFields(config.imageRepos) ]; local upstreamImage(image) = '%s:%s' % [image.name, image.tag]; local downstreamImage(registry, image) = '%s/%s:%s' % [registry, l.imageName(image.name), image.tag]; local pullPush(image, newRegistry) = [ - 'docker pull %s' % upstreamImage(image), - 'docker tag %s %s' % [upstreamImage(image), downstreamImage(newRegistry, image)], - 'docker push %s' % downstreamImage(newRegistry, image), + 'docker pull %s' % upstreamImage(image), + 'docker tag %s %s' % [upstreamImage(image), downstreamImage(newRegistry, image)], + 'docker push %s' % downstreamImage(newRegistry, image), ]; local images = makeImages(config); @@ -26,5 +26,5 @@ local output(repository) = std.flattenArrays([ for image in images ]); -function(repository="my-registry.com/repository") - std.join('\n', output(repository)) +function(repository='my-registry.com/repository') + std.join('\n', output(repository)) From 601fea2e9a9d132ae25a5e6df5bce752b00624de Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 1 Nov 2018 18:59:22 +0100 Subject: [PATCH 471/638] contrib/kube-prometheus: Create prometheus-adapter component contrib/kube-prometheus: Add section on Prometheus Adapter requirements to README.md --- README.md | 6 +- ...prometheus-rules-and-grafana-dashboards.md | 1 + example.jsonnet | 1 + .../kube-prometheus/kube-prometheus.libsonnet | 1 + .../prometheus-adapter.libsonnet | 198 ++++++++++++++++++ jsonnetfile.lock.json | 2 +- manifests/prometheus-adapter-apiService.yaml | 13 ++ 
manifests/prometheus-adapter-clusterRole.yaml | 16 ++ ...prometheus-adapter-clusterRoleBinding.yaml | 13 ++ ...s-adapter-clusterRoleBindingDelegator.yaml | 12 ++ ...us-adapter-clusterRoleServerResources.yaml | 11 + manifests/prometheus-adapter-configMap.yaml | 33 +++ manifests/prometheus-adapter-deployment.yaml | 41 ++++ ...metheus-adapter-roleBindingAuthReader.yaml | 13 ++ manifests/prometheus-adapter-service.yaml | 14 ++ .../prometheus-adapter-serviceAccount.yaml | 5 + 16 files changed, 378 insertions(+), 2 deletions(-) create mode 100644 jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet create mode 100644 manifests/prometheus-adapter-apiService.yaml create mode 100644 manifests/prometheus-adapter-clusterRole.yaml create mode 100644 manifests/prometheus-adapter-clusterRoleBinding.yaml create mode 100644 manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml create mode 100644 manifests/prometheus-adapter-clusterRoleServerResources.yaml create mode 100644 manifests/prometheus-adapter-configMap.yaml create mode 100644 manifests/prometheus-adapter-deployment.yaml create mode 100644 manifests/prometheus-adapter-roleBindingAuthReader.yaml create mode 100644 manifests/prometheus-adapter-service.yaml create mode 100644 manifests/prometheus-adapter-serviceAccount.yaml diff --git a/README.md b/README.md index 69d402f3..b035aceb 100644 --- a/README.md +++ b/README.md @@ -45,13 +45,16 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m ## Prerequisites -You will need a Kubernetes cluster, that's it! By default it is assumed, that the kubelet uses token authN and authZ, as otherwise Prometheus needs a client certificate, which gives it full access to the kubelet, rather than just the metrics. Token authN and authZ allows more fine grained and easier access control. +You will need a Kubernetes cluster, that's it! 
By default it is assumed that the kubelet uses token authentication and authorization, as otherwise Prometheus needs a client certificate, which gives it full access to the kubelet, rather than just the metrics. Token authentication and authorization allow more fine-grained and easier access control. This means the kubelet configuration must contain these flags: * `--authentication-token-webhook=true` This flag enables, that a `ServiceAccount` token can be used to authenticate against the kubelet(s). * `--authorization-mode=Webhook` This flag enables, that the kubelet will perform an RBAC request with the API to determine, whether the requesting entity (Prometheus in this case) is allow to access a resource, in specific for this project the `/metrics` endpoint. +This stack provides [resource metrics](https://github.com/kubernetes/metrics#resource-metrics-api) by deploying the [Prometheus Adapter](https://github.com/DirectXMan12/k8s-prometheus-adapter/). +This adapter is an Extension API Server and Kubernetes needs to have this feature enabled, otherwise the adapter has no effect, but is still deployed. 
+ ### minikube In order to just try out this stack, start minikube with the following command: @@ -155,6 +158,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index 1eb4f15a..72deb0e3 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -22,6 +22,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` diff --git a/example.jsonnet b/example.jsonnet index 1d36eb1f..2a10509c 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -10,4 +10,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in 
std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 4402ca96..2dd32b50 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -7,6 +7,7 @@ local configMapList = k.core.v1.configMapList; (import 'alertmanager/alertmanager.libsonnet') + (import 'prometheus-operator/prometheus-operator.libsonnet') + (import 'prometheus/prometheus.libsonnet') + +(import 'prometheus-adapter/prometheus-adapter.libsonnet') + (import 'kubernetes-mixin/mixin.libsonnet') + (import 'alerts/alerts.libsonnet') + (import 'rules/rules.libsonnet') + { diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet new file mode 100644 index 00000000..177fb197 --- /dev/null +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -0,0 +1,198 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +{ + _config+:: { + namespace: 'default', + + versions+:: { + prometheusAdapter: 'v0.3.0', + }, + + imageRepos+:: { + prometheusAdapter: 'quay.io/coreos/k8s-prometheus-adapter-amd64', + }, + + prometheusAdapter+:: { + name: 'prometheus-adapter', + labels: { name: $._config.prometheusAdapter.name }, + config: ||| + resourceRules: + cpu: + containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) + nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[1m])) by (<<.GroupBy>>) + resources: + overrides: + node: + resource: node + namespace: + resource: 
namespace + pod_name: + resource: pod + containerLabel: container_name + memory: + containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) + nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>) + resources: + overrides: + node: + resource: node + namespace: + resource: namespace + pod_name: + resource: pod + containerLabel: container_name + window: 1m + |||, + }, + }, + + prometheusAdapter+:: { + apiService: + { + apiVersion: 'apiregistration.k8s.io/v1beta1', + kind: 'APIService', + metadata: { + name: 'v1beta1.metrics.k8s.io', + }, + spec: { + service: { + name: $.prometheusAdapter.service.metadata.name, + namespace: $._config.namespace, + }, + group: 'metrics.k8s.io', + version: 'v1beta1', + insecureSkipTLSVerify: true, + groupPriorityMinimum: 100, + versionPriority: 100, + }, + }, + + configMap: + local configmap = k.core.v1.configMap; + + configmap.new('adapter-config', { 'config.yaml': $._config.prometheusAdapter.config }) + + configmap.mixin.metadata.withNamespace($._config.namespace), + + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + + service.new( + $._config.prometheusAdapter.name, + $._config.prometheusAdapter.labels, + servicePort.newNamed('https', 443, 6443), + ) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels($._config.prometheusAdapter.labels), + + deployment: + local deployment = k.apps.v1beta2.deployment; + local volume = deployment.mixin.spec.template.spec.volumesType; + local container = deployment.mixin.spec.template.spec.containersType; + local containerVolumeMount = container.volumeMountsType; + + local c = + container.new($._config.prometheusAdapter.name, $._config.imageRepos.prometheusAdapter + ':' + $._config.versions.prometheusAdapter) + + container.withArgs([ + '--cert-dir=/var/run/serving-cert', + '--config=/etc/adapter/config.yaml', + 
'--logtostderr=true', + '--metrics-relist-interval=1m', + '--prometheus-url=http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc:9090/', + '--secure-port=6443', + ]) + + container.withPorts([{ containerPort: 6443 }]) + + container.withVolumeMounts([ + containerVolumeMount.new('volume-serving-cert', '/var/run/serving-cert'), + containerVolumeMount.new('config', '/etc/adapter'), + ],); + + deployment.new($._config.prometheusAdapter.name, 1, c, $._config.prometheusAdapter.labels) + + deployment.mixin.metadata.withNamespace($._config.namespace) + + deployment.mixin.spec.selector.withMatchLabels($._config.prometheusAdapter.labels) + + deployment.mixin.spec.template.spec.withServiceAccountName($.prometheusAdapter.serviceAccount.metadata.name) + + deployment.mixin.spec.template.spec.withVolumes([ + // volume.fromSecret('volume-serving-cert', 'cm-adapter-serving-certs'), + volume.fromEmptyDir(name='volume-serving-cert'), + { name: 'config', configMap: { name: 'adapter-config' } }, + ]), + + serviceAccount: + local serviceAccount = k.core.v1.serviceAccount; + + serviceAccount.new($._config.prometheusAdapter.name) + + serviceAccount.mixin.metadata.withNamespace($._config.namespace), + + clusterRole: + local clusterRole = k.rbac.v1.clusterRole; + local policyRule = clusterRole.rulesType; + + local rules = + policyRule.new() + + policyRule.withApiGroups(['']) + + policyRule.withResources(['nodes', 'namespaces', 'pods', 'services']) + + policyRule.withVerbs(['get', 'list', 'watch']); + + clusterRole.new() + + clusterRole.mixin.metadata.withName($._config.prometheusAdapter.name) + + clusterRole.withRules(rules), + + clusterRoleBinding: + local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; + + clusterRoleBinding.new() + + clusterRoleBinding.mixin.metadata.withName($._config.prometheusAdapter.name) + + clusterRoleBinding.mixin.metadata.withNamespace($._config.namespace) + + 
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + clusterRoleBinding.mixin.roleRef.withName($.prometheusAdapter.clusterRole.metadata.name) + + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + + clusterRoleBinding.withSubjects([{ + kind: 'ServiceAccount', + name: $.prometheusAdapter.serviceAccount.metadata.name, + namespace: $._config.namespace, + }]), + + clusterRoleBindingDelegator: + local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; + + clusterRoleBinding.new() + + clusterRoleBinding.mixin.metadata.withName('resource-metrics:system:auth-delegator') + + clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + clusterRoleBinding.mixin.roleRef.withName('system:auth-delegator') + + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + + clusterRoleBinding.withSubjects([{ + kind: 'ServiceAccount', + name: $.prometheusAdapter.serviceAccount.metadata.name, + namespace: $._config.namespace, + }]), + + clusterRoleServerResources: + local clusterRole = k.rbac.v1.clusterRole; + local policyRule = clusterRole.rulesType; + + local rules = + policyRule.new() + + policyRule.withApiGroups(['metrics.k8s.io']) + + policyRule.withResources(['*']) + + policyRule.withVerbs(['*']); + + clusterRole.new() + + clusterRole.mixin.metadata.withName('resource-metrics-server-resources') + + clusterRole.withRules(rules), + + roleBindingAuthReader: + local roleBinding = k.rbac.v1.roleBinding; + + roleBinding.new() + + roleBinding.mixin.metadata.withName('resource-metrics-auth-reader') + + roleBinding.mixin.metadata.withNamespace('kube-system') + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withName('extension-apiserver-authentication-reader') + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + roleBinding.withSubjects([{ + kind: 'ServiceAccount', + name: $.prometheusAdapter.serviceAccount.metadata.name, + namespace: 
$._config.namespace, + }]), + }, +} diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 195e58e0..e874fc68 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "fa0a0ae33a16a23845da8ab9973dd4eed50a20df" + "version": "d00e8996976492005174b6412a9194421548b247" }, { "name": "ksonnet", diff --git a/manifests/prometheus-adapter-apiService.yaml b/manifests/prometheus-adapter-apiService.yaml new file mode 100644 index 00000000..95d5c32d --- /dev/null +++ b/manifests/prometheus-adapter-apiService.yaml @@ -0,0 +1,13 @@ +apiVersion: apiregistration.k8s.io/v1beta1 +kind: APIService +metadata: + name: v1beta1.metrics.k8s.io +spec: + group: metrics.k8s.io + groupPriorityMinimum: 100 + insecureSkipTLSVerify: true + service: + name: prometheus-adapter + namespace: monitoring + version: v1beta1 + versionPriority: 100 diff --git a/manifests/prometheus-adapter-clusterRole.yaml b/manifests/prometheus-adapter-clusterRole.yaml new file mode 100644 index 00000000..a02d2bb0 --- /dev/null +++ b/manifests/prometheus-adapter-clusterRole.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus-adapter +rules: +- apiGroups: + - "" + resources: + - nodes + - namespaces + - pods + - services + verbs: + - get + - list + - watch diff --git a/manifests/prometheus-adapter-clusterRoleBinding.yaml b/manifests/prometheus-adapter-clusterRoleBinding.yaml new file mode 100644 index 00000000..29fa9176 --- /dev/null +++ b/manifests/prometheus-adapter-clusterRoleBinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus-adapter + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-adapter +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git 
a/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml b/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml new file mode 100644 index 00000000..4295b50f --- /dev/null +++ b/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: resource-metrics:system:auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/manifests/prometheus-adapter-clusterRoleServerResources.yaml b/manifests/prometheus-adapter-clusterRoleServerResources.yaml new file mode 100644 index 00000000..fcb914c3 --- /dev/null +++ b/manifests/prometheus-adapter-clusterRoleServerResources.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: resource-metrics-server-resources +rules: +- apiGroups: + - metrics.k8s.io + resources: + - '*' + verbs: + - '*' diff --git a/manifests/prometheus-adapter-configMap.yaml b/manifests/prometheus-adapter-configMap.yaml new file mode 100644 index 00000000..a231de36 --- /dev/null +++ b/manifests/prometheus-adapter-configMap.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +data: + config.yaml: | + resourceRules: + cpu: + containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) + nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[1m])) by (<<.GroupBy>>) + resources: + overrides: + node: + resource: node + namespace: + resource: namespace + pod_name: + resource: pod + containerLabel: container_name + memory: + containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) + nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>) + resources: + overrides: + node: + resource: node + namespace: + resource: 
namespace + pod_name: + resource: pod + containerLabel: container_name + window: 1m +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml new file mode 100644 index 00000000..63360499 --- /dev/null +++ b/manifests/prometheus-adapter-deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1beta2 +kind: Deployment +metadata: + name: prometheus-adapter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + name: prometheus-adapter + template: + metadata: + labels: + name: prometheus-adapter + spec: + containers: + - args: + - --cert-dir=/var/run/serving-cert + - --config=/etc/adapter/config.yaml + - --logtostderr=true + - --metrics-relist-interval=1m + - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ + - --secure-port=6443 + image: directxman12/k8s-prometheus-adapter-amd64:latest + name: prometheus-adapter + ports: + - containerPort: 6443 + volumeMounts: + - mountPath: /var/run/serving-cert + name: volume-serving-cert + readOnly: false + - mountPath: /etc/adapter + name: config + readOnly: false + serviceAccountName: prometheus-adapter + volumes: + - emptyDir: {} + name: volume-serving-cert + - configMap: + name: adapter-config + name: config diff --git a/manifests/prometheus-adapter-roleBindingAuthReader.yaml b/manifests/prometheus-adapter-roleBindingAuthReader.yaml new file mode 100644 index 00000000..48c8f325 --- /dev/null +++ b/manifests/prometheus-adapter-roleBindingAuthReader.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: resource-metrics-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/manifests/prometheus-adapter-service.yaml 
b/manifests/prometheus-adapter-service.yaml new file mode 100644 index 00000000..e786e01c --- /dev/null +++ b/manifests/prometheus-adapter-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + name: prometheus-adapter + name: prometheus-adapter + namespace: monitoring +spec: + ports: + - name: https + port: 443 + targetPort: 6443 + selector: + name: prometheus-adapter diff --git a/manifests/prometheus-adapter-serviceAccount.yaml b/manifests/prometheus-adapter-serviceAccount.yaml new file mode 100644 index 00000000..d7e70503 --- /dev/null +++ b/manifests/prometheus-adapter-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-adapter + namespace: monitoring From 7f77f83baba6e8512a800f3e9a0d2b1562ae9ccb Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 9 Nov 2018 15:09:20 +0100 Subject: [PATCH 472/638] contrib/kube-prometheus: Update Prometheus Adapter image --- jsonnetfile.lock.json | 2 +- manifests/prometheus-adapter-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index e874fc68..acac2b15 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "d00e8996976492005174b6412a9194421548b247" + "version": "556153e077ed61c4567ae1aa920903d2c7920c23" }, { "name": "ksonnet", diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index 63360499..f7caa2d2 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -21,7 +21,7 @@ spec: - --metrics-relist-interval=1m - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ - --secure-port=6443 - image: directxman12/k8s-prometheus-adapter-amd64:latest + image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.3.0 name: prometheus-adapter ports: - containerPort: 
6443 From b7273bf567b2ddcaf34b238165b2e07b2ad3210a Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 12 Nov 2018 11:28:37 +0100 Subject: [PATCH 473/638] contrib/kube-prometheus: Mount emptyDir as tmpfs to /tmp in adapter --- .../prometheus-adapter/prometheus-adapter.libsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet index 177fb197..16189d39 100644 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -103,6 +103,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; ]) + container.withPorts([{ containerPort: 6443 }]) + container.withVolumeMounts([ + containerVolumeMount.new('tmpfs', '/tmp'), containerVolumeMount.new('volume-serving-cert', '/var/run/serving-cert'), containerVolumeMount.new('config', '/etc/adapter'), ],); @@ -112,7 +113,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; deployment.mixin.spec.selector.withMatchLabels($._config.prometheusAdapter.labels) + deployment.mixin.spec.template.spec.withServiceAccountName($.prometheusAdapter.serviceAccount.metadata.name) + deployment.mixin.spec.template.spec.withVolumes([ - // volume.fromSecret('volume-serving-cert', 'cm-adapter-serving-certs'), + volume.fromEmptyDir(name='tmpfs'), volume.fromEmptyDir(name='volume-serving-cert'), { name: 'config', configMap: { name: 'adapter-config' } }, ]), From cc451840f37fae5c19c5733c7fd9ec685485d2b0 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 12 Nov 2018 11:34:27 +0100 Subject: [PATCH 474/638] contrib/kube-prometheus: Update manifests with /tmp emptyDir for adapter --- jsonnetfile.lock.json | 2 +- manifests/prometheus-adapter-deployment.yaml | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index
acac2b15..2dace0bd 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "556153e077ed61c4567ae1aa920903d2c7920c23" + "version": "5185231304f688cf127bf235a4dfdf9f4f9e7821" }, { "name": "ksonnet", diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index f7caa2d2..9d28503c 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -26,6 +26,9 @@ spec: ports: - containerPort: 6443 volumeMounts: + - mountPath: /tmp + name: tmpfs + readOnly: false - mountPath: /var/run/serving-cert name: volume-serving-cert readOnly: false @@ -34,6 +37,8 @@ spec: readOnly: false serviceAccountName: prometheus-adapter volumes: + - emptyDir: {} + name: tmpfs - emptyDir: {} name: volume-serving-cert - configMap: From 1b8684083ce46f158f7ca3f088ee11e18a737c03 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 13 Nov 2018 16:15:05 +0100 Subject: [PATCH 475/638] *: Update to Alertmanager v0.15.3 --- jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index a6d9e8e6..e109b0ad 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - alertmanager: 'v0.15.2', + alertmanager: 'v0.15.3', }, imageRepos+:: { From cea2da78e44f16db6c928b676b30a87613b35030 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 13 Nov 2018 16:21:02 +0100 Subject: [PATCH 476/638] contrib/kube-prometheus: Generate --- README.md | 8 +++--- jsonnetfile.lock.json | 10 ++++---- manifests/alertmanager-alertmanager.yaml | 2 +- 
manifests/grafana-dashboardDefinitions.yaml | 27 ++++++++++++++------- manifests/prometheus-rules.yaml | 22 ++++++++++++----- 5 files changed, 44 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index b035aceb..f6fc9860 100644 --- a/README.md +++ b/README.md @@ -255,7 +255,7 @@ These are the available fields with their respective default values: namespace: "default", versions+:: { - alertmanager: "v0.15.2", + alertmanager: "v0.15.3", nodeExporter: "v0.16.0", kubeStateMetrics: "v1.3.1", kubeRbacProxy: "v0.3.1", @@ -377,9 +377,9 @@ $ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization s docker pull quay.io/coreos/addon-resizer:1.0 docker tag quay.io/coreos/addon-resizer:1.0 internal-registry.com/organization/addon-resizer:1.0 docker push internal-registry.com/organization/addon-resizer:1.0 -docker pull quay.io/prometheus/alertmanager:v0.15.2 -docker tag quay.io/prometheus/alertmanager:v0.15.2 internal-registry.com/organization/alertmanager:v0.15.2 -docker push internal-registry.com/organization/alertmanager:v0.15.2 +docker pull quay.io/prometheus/alertmanager:v0.15.3 +docker tag quay.io/prometheus/alertmanager:v0.15.3 internal-registry.com/organization/alertmanager:v0.15.3 +docker push internal-registry.com/organization/alertmanager:v0.15.3 ... 
``` diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 2dace0bd..8c6ef28c 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "5185231304f688cf127bf235a4dfdf9f4f9e7821" + "version": "c9350aab06b47bcf8410b597ba50b4addf21ee3d" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "1595151b85934d55ea6969a781039d66f82b22d5" + "version": "f7ca48cca5d9cadc9a2203b8c0b3bb3eb85f3294" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "1ed195577cd8a406d4811dd6818e939169b686a7" + "version": "d407225c5a2e087eb68843528aab2be0507c73b8" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "04e51ce1caeaa4c9aed4c446c9922388a13f6cb1" + "version": "90fbdbf08cf0d4bdc78ab52151041da36a7b0abc" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "83304cfc808cf6303d48c45a696f169fae422e68" + "version": "ee9dcbca0d89dc563c9e6bc725fab0c6f21d689b" } ] } diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index e800beac..2230ea9e 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -11,4 +11,4 @@ spec: beta.kubernetes.io/os: linux replicas: 3 serviceAccountName: alertmanager-main - version: v0.15.2 + version: v0.15.3 diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 31f2ffcf..e4364aa6 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -5279,7 +5279,8 @@ items: "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "80, 90", @@ 
-5484,7 +5485,8 @@ items: "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "80, 90", @@ -6580,7 +6582,8 @@ items: "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6659,7 +6662,8 @@ items: "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6738,7 +6742,8 @@ items: "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6832,7 +6837,8 @@ items: "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6912,7 +6918,8 @@ items: "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", 
namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6992,7 +6999,8 @@ items: "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -7072,7 +7080,8 @@ items: "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index f0e668a1..6a5df8a3 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -223,22 +223,22 @@ spec: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted_seconds_total_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_saturation:avg_irate - expr: | - max by (namespace, pod, device) ((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) + 
max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} + - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_usage:' - expr: | - max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) + max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_avail:' - expr: | sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + @@ -829,6 +829,16 @@ spec: for: 5m labels: severity: critical + - alert: KubePersistentVolumeErrors + annotations: + message: The persistent volume {{ $labels.persistentvolume }} has status {{ + $labels.phase }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical - name: kubernetes-system rules: - alert: KubeNodeNotReady From a24090932613f52b0dc40370a590679e6693b8a1 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Tue, 13 Nov 2018 14:43:30 +0100 Subject: [PATCH 477/638] contrib/kube-prometheus: Update custom-metrics example to use our adapter --- ...r-auth-delegator-cluster-role-binding.yaml | 12 --- ...cs-apiserver-auth-reader-role-binding.yaml | 13 --- .../custom-metrics-apiserver-deployment.yaml | 41 -------- ...-resource-reader-cluster-role-binding.yaml | 6 +- ...tom-metrics-apiserver-service-account.yaml | 4 - .../custom-metrics-apiserver-service.yaml | 10 -- .../custom-metrics-apiservice.yaml | 2 +- .../custom-metrics-configmap.yaml | 98 +++++++++++++++++++ ...-metrics-resource-reader-cluster-role.yaml | 14 --- 
experimental/custom-metrics-api/deploy.sh | 13 --- experimental/custom-metrics-api/gencerts.sh | 37 ------- experimental/custom-metrics-api/teardown.sh | 13 --- 12 files changed, 102 insertions(+), 161 deletions(-) delete mode 100644 experimental/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml delete mode 100644 experimental/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml delete mode 100644 experimental/custom-metrics-api/custom-metrics-apiserver-deployment.yaml delete mode 100644 experimental/custom-metrics-api/custom-metrics-apiserver-service-account.yaml delete mode 100644 experimental/custom-metrics-api/custom-metrics-apiserver-service.yaml create mode 100644 experimental/custom-metrics-api/custom-metrics-configmap.yaml delete mode 100644 experimental/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml delete mode 100755 experimental/custom-metrics-api/deploy.sh delete mode 100755 experimental/custom-metrics-api/gencerts.sh delete mode 100755 experimental/custom-metrics-api/teardown.sh diff --git a/experimental/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml deleted file mode 100644 index 8853bc1f..00000000 --- a/experimental/custom-metrics-api/custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRoleBinding -metadata: - name: custom-metrics:system:auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator -subjects: -- kind: ServiceAccount - name: custom-metrics-apiserver - namespace: monitoring diff --git a/experimental/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml deleted 
file mode 100644 index 682143cf..00000000 --- a/experimental/custom-metrics-api/custom-metrics-apiserver-auth-reader-role-binding.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: RoleBinding -metadata: - name: custom-metrics-auth-reader - namespace: kube-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: extension-apiserver-authentication-reader -subjects: -- kind: ServiceAccount - name: custom-metrics-apiserver - namespace: monitoring diff --git a/experimental/custom-metrics-api/custom-metrics-apiserver-deployment.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-deployment.yaml deleted file mode 100644 index e5b4beea..00000000 --- a/experimental/custom-metrics-api/custom-metrics-apiserver-deployment.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: Deployment -metadata: - labels: - app: custom-metrics-apiserver - name: custom-metrics-apiserver -spec: - replicas: 1 - selector: - matchLabels: - app: custom-metrics-apiserver - template: - metadata: - labels: - app: custom-metrics-apiserver - name: custom-metrics-apiserver - spec: - serviceAccountName: custom-metrics-apiserver - containers: - - name: custom-metrics-apiserver - image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.2.0 - args: - - /adapter - - --secure-port=6443 - - --tls-cert-file=/var/run/serving-cert/serving.crt - - --tls-private-key-file=/var/run/serving-cert/serving.key - - --logtostderr=true - - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ - - --metrics-relist-interval=30s - - --rate-interval=5m - - --v=10 - ports: - - containerPort: 6443 - volumeMounts: - - mountPath: /var/run/serving-cert - name: volume-serving-cert - readOnly: true - volumes: - - name: volume-serving-cert - secret: - secretName: cm-adapter-serving-certs diff --git a/experimental/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml 
b/experimental/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml index 0335c177..e2b1ca43 100644 --- a/experimental/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml +++ b/experimental/custom-metrics-api/custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml @@ -1,12 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: - name: custom-metrics-resource-reader + name: custom-metrics-server-resources roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: custom-metrics-resource-reader + name: custom-metrics-server-resources subjects: - kind: ServiceAccount - name: custom-metrics-apiserver + name: prometheus-adapter namespace: monitoring diff --git a/experimental/custom-metrics-api/custom-metrics-apiserver-service-account.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-service-account.yaml deleted file mode 100644 index 29359409..00000000 --- a/experimental/custom-metrics-api/custom-metrics-apiserver-service-account.yaml +++ /dev/null @@ -1,4 +0,0 @@ -kind: ServiceAccount -apiVersion: v1 -metadata: - name: custom-metrics-apiserver diff --git a/experimental/custom-metrics-api/custom-metrics-apiserver-service.yaml b/experimental/custom-metrics-api/custom-metrics-apiserver-service.yaml deleted file mode 100644 index fb0addcb..00000000 --- a/experimental/custom-metrics-api/custom-metrics-apiserver-service.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: custom-metrics-apiserver -spec: - ports: - - port: 443 - targetPort: 6443 - selector: - app: custom-metrics-apiserver diff --git a/experimental/custom-metrics-api/custom-metrics-apiservice.yaml b/experimental/custom-metrics-api/custom-metrics-apiservice.yaml index cfc2ee63..98f87495 100644 --- a/experimental/custom-metrics-api/custom-metrics-apiservice.yaml +++ b/experimental/custom-metrics-api/custom-metrics-apiservice.yaml @@ -4,7 +4,7 
@@ metadata: name: v1beta1.custom.metrics.k8s.io spec: service: - name: custom-metrics-apiserver + name: prometheus-adapter namespace: monitoring group: custom.metrics.k8s.io version: v1beta1 diff --git a/experimental/custom-metrics-api/custom-metrics-configmap.yaml b/experimental/custom-metrics-api/custom-metrics-configmap.yaml new file mode 100644 index 00000000..2e209cc3 --- /dev/null +++ b/experimental/custom-metrics-api/custom-metrics-configmap.yaml @@ -0,0 +1,98 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring +data: + config.yaml: | + rules: + - seriesQuery: '{__name__=~"^container_.*",container_name!="POD",namespace!="",pod_name!=""}' + seriesFilters: [] + resources: + overrides: + namespace: + resource: namespace + pod_name: + resource: pod + name: + matches: ^container_(.*)_seconds_total$ + as: "" + metricsQuery: sum(rate(<<.Series>>{<<.LabelMatchers>>,container_name!="POD"}[1m])) by (<<.GroupBy>>) + - seriesQuery: '{__name__=~"^container_.*",container_name!="POD",namespace!="",pod_name!=""}' + seriesFilters: + - isNot: ^container_.*_seconds_total$ + resources: + overrides: + namespace: + resource: namespace + pod_name: + resource: pod + name: + matches: ^container_(.*)_total$ + as: "" + metricsQuery: sum(rate(<<.Series>>{<<.LabelMatchers>>,container_name!="POD"}[1m])) by (<<.GroupBy>>) + - seriesQuery: '{__name__=~"^container_.*",container_name!="POD",namespace!="",pod_name!=""}' + seriesFilters: + - isNot: ^container_.*_total$ + resources: + overrides: + namespace: + resource: namespace + pod_name: + resource: pod + name: + matches: ^container_(.*)$ + as: "" + metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>,container_name!="POD"}) by (<<.GroupBy>>) + - seriesQuery: '{namespace!="",__name__!~"^container_.*"}' + seriesFilters: + - isNot: .*_total$ + resources: + template: <<.Resource>> + name: + matches: "" + as: "" + metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>) + - seriesQuery: 
'{namespace!="",__name__!~"^container_.*"}' + seriesFilters: + - isNot: .*_seconds_total + resources: + template: <<.Resource>> + name: + matches: ^(.*)_total$ + as: "" + metricsQuery: sum(rate(<<.Series>>{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) + - seriesQuery: '{namespace!="",__name__!~"^container_.*"}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: ^(.*)_seconds_total$ + as: "" + metricsQuery: sum(rate(<<.Series>>{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) + resourceRules: + cpu: + containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) + nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[1m])) by (<<.GroupBy>>) + resources: + overrides: + node: + resource: node + namespace: + resource: namespace + pod_name: + resource: pod + containerLabel: container_name + memory: + containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) + nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>) + resources: + overrides: + node: + resource: node + namespace: + resource: namespace + pod_name: + resource: pod + containerLabel: container_name + window: 1m diff --git a/experimental/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml b/experimental/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml deleted file mode 100644 index a5ad7604..00000000 --- a/experimental/custom-metrics-api/custom-metrics-resource-reader-cluster-role.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRole -metadata: - name: custom-metrics-resource-reader -rules: -- apiGroups: - - "" - resources: - - namespaces - - pods - - services - verbs: - - get - - list diff --git a/experimental/custom-metrics-api/deploy.sh b/experimental/custom-metrics-api/deploy.sh deleted file mode 100755 index 2255c7fd..00000000 --- 
a/experimental/custom-metrics-api/deploy.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash - -kubectl create -f custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml -kubectl create -f custom-metrics-apiserver-auth-reader-role-binding.yaml -kubectl -n monitoring create -f cm-adapter-serving-certs.yaml -kubectl -n monitoring create -f custom-metrics-apiserver-deployment.yaml -kubectl create -f custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml -kubectl -n monitoring create -f custom-metrics-apiserver-service-account.yaml -kubectl -n monitoring create -f custom-metrics-apiserver-service.yaml -kubectl create -f custom-metrics-apiservice.yaml -kubectl create -f custom-metrics-cluster-role.yaml -kubectl create -f custom-metrics-resource-reader-cluster-role.yaml -kubectl create -f hpa-custom-metrics-cluster-role-binding.yaml diff --git a/experimental/custom-metrics-api/gencerts.sh b/experimental/custom-metrics-api/gencerts.sh deleted file mode 100755 index a8f5539d..00000000 --- a/experimental/custom-metrics-api/gencerts.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -# exit immediately when a command fails -set -e -# only exit with zero if all commands of the pipeline exit successfully -set -o pipefail -# error on unset variables -set -u - -# Detect if we are on mac or should use GNU base64 options -case $(uname) in - Darwin) - b64_opts='-b=0' - ;; - *) - b64_opts='--wrap=0' -esac - -go get -v -u github.com/cloudflare/cfssl/cmd/... 
- -export PURPOSE=metrics -openssl req -x509 -sha256 -new -nodes -days 365 -newkey rsa:2048 -keyout ${PURPOSE}-ca.key -out ${PURPOSE}-ca.crt -subj "/CN=ca" -echo '{"signing":{"default":{"expiry":"43800h","usages":["signing","key encipherment","'${PURPOSE}'"]}}}' > "${PURPOSE}-ca-config.json" - -export SERVICE_NAME=custom-metrics-apiserver -export ALT_NAMES='"custom-metrics-apiserver.monitoring","custom-metrics-apiserver.monitoring.svc"' -echo "{\"CN\":\"${SERVICE_NAME}\", \"hosts\": [${ALT_NAMES}], \"key\": {\"algo\": \"rsa\",\"size\": 2048}}" | \ - cfssl gencert -ca=metrics-ca.crt -ca-key=metrics-ca.key -config=metrics-ca-config.json - | cfssljson -bare apiserver - -cat <<-EOF > cm-adapter-serving-certs.yaml -apiVersion: v1 -kind: Secret -metadata: - name: cm-adapter-serving-certs -data: - serving.crt: $(base64 ${b64_opts} < apiserver.pem) - serving.key: $(base64 ${b64_opts} < apiserver-key.pem) -EOF diff --git a/experimental/custom-metrics-api/teardown.sh b/experimental/custom-metrics-api/teardown.sh deleted file mode 100755 index 4797de1c..00000000 --- a/experimental/custom-metrics-api/teardown.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash - -kubectl delete -f custom-metrics-apiserver-auth-delegator-cluster-role-binding.yaml -kubectl delete -f custom-metrics-apiserver-auth-reader-role-binding.yaml -kubectl -n monitoring delete -f cm-adapter-serving-certs.yaml -kubectl -n monitoring delete -f custom-metrics-apiserver-deployment.yaml -kubectl delete -f custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml -kubectl -n monitoring delete -f custom-metrics-apiserver-service-account.yaml -kubectl -n monitoring delete -f custom-metrics-apiserver-service.yaml -kubectl delete -f custom-metrics-apiservice.yaml -kubectl delete -f custom-metrics-cluster-role.yaml -kubectl delete -f custom-metrics-resource-reader-cluster-role.yaml -kubectl delete -f hpa-custom-metrics-cluster-role-binding.yaml From d4089d7125bd765b2077b7a31589831b6f88006e Mon Sep 17 
00:00:00 2001 From: Matthias Loibl Date: Tue, 13 Nov 2018 14:43:53 +0100 Subject: [PATCH 478/638] contrib/kube-prometheus: Add sample-app for showing custom metrics with HPA Use interval in ServiceMonitor endpoint Reintroduce updated dpeloy.sh & teardown.sh --- experimental/custom-metrics-api/deploy.sh | 7 ++ .../custom-metrics-api/sample-app.yaml | 67 +++++++++++++++++++ experimental/custom-metrics-api/teardown.sh | 7 ++ 3 files changed, 81 insertions(+) create mode 100644 experimental/custom-metrics-api/deploy.sh create mode 100644 experimental/custom-metrics-api/sample-app.yaml create mode 100644 experimental/custom-metrics-api/teardown.sh diff --git a/experimental/custom-metrics-api/deploy.sh b/experimental/custom-metrics-api/deploy.sh new file mode 100644 index 00000000..a7324831 --- /dev/null +++ b/experimental/custom-metrics-api/deploy.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +kubectl apply -n monitoring custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml +kubectl apply -n monitoring custom-metrics-apiservice.yaml +kubectl apply -n monitoring custom-metrics-cluster-role.yaml +kubectl apply -n monitoring custom-metrics-configmap.yaml +kubectl apply -n monitoring hpa-custom-metrics-cluster-role-binding.yaml diff --git a/experimental/custom-metrics-api/sample-app.yaml b/experimental/custom-metrics-api/sample-app.yaml new file mode 100644 index 00000000..470887c6 --- /dev/null +++ b/experimental/custom-metrics-api/sample-app.yaml @@ -0,0 +1,67 @@ +kind: ServiceMonitor +apiVersion: monitoring.coreos.com/v1 +metadata: + name: sample-app + labels: + app: sample-app +spec: + selector: + matchLabels: + app: sample-app + endpoints: + - port: http + interval: 5s +--- +apiVersion: v1 +kind: Service +metadata: + name: sample-app + labels: + app: sample-app +spec: + ports: + - name: http + port: 8080 + targetPort: 8080 + selector: + app: sample-app +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sample-app + labels: + app: sample-app +spec: 
+ replicas: 1 + selector: + matchLabels: + app: sample-app + template: + metadata: + labels: + app: sample-app + spec: + containers: + - image: luxas/autoscale-demo:v0.1.2 + name: metrics-provider + ports: + - name: http + containerPort: 8080 +--- +kind: HorizontalPodAutoscaler +apiVersion: autoscaling/v2beta1 +metadata: + name: sample-app +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: sample-app + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: Pods + pods: + metricName: http_requests + targetAverageValue: 500m diff --git a/experimental/custom-metrics-api/teardown.sh b/experimental/custom-metrics-api/teardown.sh new file mode 100644 index 00000000..2287c799 --- /dev/null +++ b/experimental/custom-metrics-api/teardown.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +kubectl delete -n monitoring custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml +kubectl delete -n monitoring custom-metrics-apiservice.yaml +kubectl delete -n monitoring custom-metrics-cluster-role.yaml +kubectl delete -n monitoring custom-metrics-configmap.yaml +kubectl delete -n monitoring hpa-custom-metrics-cluster-role-binding.yaml From 1b495e7bcb7713eb9b058dc9f98a0a6d283dd026 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 14 Nov 2018 16:19:05 +0100 Subject: [PATCH 479/638] contrib/kube-prometheus: Update README.md for custom-metrics-api --- experimental/custom-metrics-api/README.md | 18 ++++++++++++++---- experimental/custom-metrics-api/deploy.sh | 2 +- experimental/custom-metrics-api/teardown.sh | 2 +- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/experimental/custom-metrics-api/README.md b/experimental/custom-metrics-api/README.md index 91375a42..c5c7102c 100644 --- a/experimental/custom-metrics-api/README.md +++ b/experimental/custom-metrics-api/README.md @@ -1,11 +1,21 @@ # Custom Metrics API -The custom metrics API allows the HPA v2 to scale on arbirary metrics. 
+The custom metrics API allows the HPA v2 to scale based on arbitrary metrics. -This directory contains an example deployment of the custom metrics API adapter using Prometheus as the backing monitoring system. +This directory contains an example deployment which extends the Prometheus Adapter, deployed with kube-prometheus, to serve the [Custom Metrics API](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/instrumentation/custom-metrics-api.md) by talking to Prometheus running inside the cluster. -In order to deploy the custom metrics adapter for Prometheus you need to generate TLS certficates used to serve the API. An example of how these could be generated can be found in `./gencerts.sh`, note that this is _not_ recommended to be used in production. You need to employ a secure PKI strategy, this is merely an example to get started and try it out quickly. +Make sure you have the Prometheus Adapter up and running in the `monitoring` namespace. -Once the generated `Secret` with the certificates is in place, you can deploy everything in the `monitoring` namespace using `./deploy.sh`. +You can deploy everything in the `monitoring` namespace using `./deploy.sh`. When you're done, you can teardown using the `./teardown.sh` script. + +### Sample App + +Additionally, this directory contains a sample app that uses the [Horizontal Pod Autoscaler](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) to scale the Deployment's replicas of Pods up and down as needed. +Deploy this app by running `kubectl apply -f sample-app.yaml`. +Make the app accessible on your system, for example by using `kubectl -n monitoring port-forward svc/sample-app 8080`. Next you need to put some load on its http endpoints.
+ +A tool like [hey](https://github.com/rakyll/hey) is helpful for doing so: `hey -c 20 -n 100000000 http://localhost:8080/metrics` + +There is even more detailed information on this sample app at [luxas/kubeadm-workshop](https://github.com/luxas/kubeadm-workshop#deploying-the-prometheus-operator-for-monitoring-services-in-the-cluster). diff --git a/experimental/custom-metrics-api/deploy.sh b/experimental/custom-metrics-api/deploy.sh index a7324831..1ac74878 100644 --- a/experimental/custom-metrics-api/deploy.sh +++ b/experimental/custom-metrics-api/deploy.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash kubectl apply -n monitoring custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml kubectl apply -n monitoring custom-metrics-apiservice.yaml diff --git a/experimental/custom-metrics-api/teardown.sh b/experimental/custom-metrics-api/teardown.sh index 2287c799..a62f685e 100644 --- a/experimental/custom-metrics-api/teardown.sh +++ b/experimental/custom-metrics-api/teardown.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash kubectl delete -n monitoring custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml kubectl delete -n monitoring custom-metrics-apiservice.yaml From 49644189c2538cf434292f2238cc59805fc38d37 Mon Sep 17 00:00:00 2001 From: Jerome Froelich Date: Wed, 14 Nov 2018 12:41:33 -0500 Subject: [PATCH 480/638] kube-prometheus: Add documentaton to README on the Pod Anti-Affinity mixin Currently there is no documentation on the Pod Anti-Affinity mixin which makes it difficult to find. To address this, this PR adds a section to the README on how one can use it.
--- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f6fc9860..04c7eb86 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m * [node-exporter DaemonSet namespace](#node-exporter-daemonset-namespace) * [Alertmanager configuration](#alertmanager-configuration) * [Static etcd configuration](#static-etcd-configuration) + * [Pod Anti-Affinity](#pod-anti-affinity) * [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) * [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) * [Minikube Example](#minikube-example) @@ -225,7 +226,7 @@ docker run \ ``` ## Update from upstream project -You may wish to fetch changes made on this project so they are available to you. +You may wish to fetch changes made on this project so they are available to you. ### Update jb jb may have been updated so it's a good idea to get the latest version of this binary @@ -523,6 +524,16 @@ In order to configure a static etcd cluster to scrape there is a simple [kube-pr > Note that monitoring etcd in minikube is currently not possible because of how etcd is setup. (minikube's etcd binds to 127.0.0.1:2379 only, and within host networking namespace.) 
+### Pod Anti-Affinity + +To prevent `Prometheus` and `Alertmanager` instances from being deployed onto the same node when +possible, one can include the [kube-prometheus-anti-affinity.libsonnet](jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet) mixin: + +```jsonnet +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') +``` + ### Customizing Prometheus alerting/recording rules and Grafana dashboards See [developing Prometheus rules and Grafana dashboards](docs/developing-prometheus-rules-and-grafana-dashboards.md) guide. From b0c35b544751de746646b0ffd4c1e4276ac002d6 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 7 Nov 2018 18:34:57 +0100 Subject: [PATCH 481/638] contrib/kube-prometheus: Remove superseded compatibility node exporter rules --- ...rter-v0.16.0-compatibility-rules.libsonnet | 406 ------------------ jsonnet/kube-prometheus/rules/rules.libsonnet | 3 +- 2 files changed, 1 insertion(+), 408 deletions(-) delete mode 100644 jsonnet/kube-prometheus/rules/node-exporter-v0.16.0-compatibility-rules.libsonnet diff --git a/jsonnet/kube-prometheus/rules/node-exporter-v0.16.0-compatibility-rules.libsonnet b/jsonnet/kube-prometheus/rules/node-exporter-v0.16.0-compatibility-rules.libsonnet deleted file mode 100644 index f8e5d6d5..00000000 --- a/jsonnet/kube-prometheus/rules/node-exporter-v0.16.0-compatibility-rules.libsonnet +++ /dev/null @@ -1,406 +0,0 @@ -{ - prometheusRules+:: { - groups+: [ - { - name: 'node_exporter-16-bcache', - rules: [ - { - expr: 'node_bcache_cache_read_races', - record: 'node_bcache_cache_read_races_total', - }, - ], - }, - { - name: 'node_exporter-16-buddyinfo', - rules: [ - { - expr: 'node_buddyinfo_blocks', - record: 'node_buddyinfo_count', - }, - ], - }, - { - name: 'node_exporter-16-stat', - rules: [ - { - expr: 'node_boot_time_seconds', - record: 'node_boot_time', - }, - { - expr: 'node_context_switches_total', - record: 
'node_context_switches', - }, - { - expr: 'node_forks_total', - record: 'node_forks', - }, - { - expr: 'node_intr_total', - record: 'node_intr', - }, - ], - }, - { - name: 'node_exporter-16-cpu', - rules: [ - { - expr: 'label_replace(node_cpu_seconds_total, "cpu", "$1", "cpu", "cpu(.+)")', - record: 'node_cpu', - }, - ], - }, - { - name: 'node_exporter-16-diskstats', - rules: [ - { - expr: 'node_disk_read_bytes_total', - record: 'node_disk_bytes_read', - }, - { - expr: 'node_disk_written_bytes_total', - record: 'node_disk_bytes_written', - }, - { - expr: 'node_disk_io_time_seconds_total * 1000', - record: 'node_disk_io_time_ms', - }, - { - expr: 'node_disk_io_time_weighted_seconds_total', - record: 'node_disk_io_time_weighted', - }, - { - expr: 'node_disk_reads_completed_total', - record: 'node_disk_reads_completed', - }, - { - expr: 'node_disk_reads_merged_total', - record: 'node_disk_reads_merged', - }, - { - expr: 'node_disk_read_time_seconds_total * 1000', - record: 'node_disk_read_time_ms', - }, - { - expr: 'node_disk_writes_completed_total', - record: 'node_disk_writes_completed', - }, - { - expr: 'node_disk_writes_merged_total', - record: 'node_disk_writes_merged', - }, - { - expr: 'node_disk_write_time_seconds_total * 1000', - record: 'node_disk_write_time_ms', - }, - ], - }, - { - name: 'node_exporter-16-filesystem', - rules: [ - { - expr: 'node_filesystem_free_bytes', - record: 'node_filesystem_free', - }, - { - expr: 'node_filesystem_avail_bytes', - record: 'node_filesystem_avail', - }, - { - expr: 'node_filesystem_size_bytes', - record: 'node_filesystem_size', - }, - ], - }, - { - name: 'node_exporter-16-infiniband', - rules: [ - { - expr: 'node_infiniband_port_data_received_bytes_total', - record: 'node_infiniband_port_data_received_bytes', - }, - { - expr: 'node_infiniband_port_data_transmitted_bytes_total', - record: 'node_infiniband_port_data_transmitted_bytes', - }, - ], - }, - { - name: 'node_exporter-16-interrupts', - rules: [ - { - expr: 
'node_interrupts_total', - record: 'node_interrupts', - }, - ], - }, - { - name: 'node_exporter-16-memory', - rules: [ - { - expr: 'node_memory_Active_bytes', - record: 'node_memory_Active', - }, - { - expr: 'node_memory_Active_anon_bytes', - record: 'node_memory_Active_anon', - }, - { - expr: 'node_memory_Active_file_bytes', - record: 'node_memory_Active_file', - }, - { - expr: 'node_memory_AnonHugePages_bytes', - record: 'node_memory_AnonHugePages', - }, - { - expr: 'node_memory_AnonPages_bytes', - record: 'node_memory_AnonPages', - }, - { - expr: 'node_memory_Bounce_bytes', - record: 'node_memory_Bounce', - }, - { - expr: 'node_memory_Buffers_bytes', - record: 'node_memory_Buffers', - }, - { - expr: 'node_memory_Cached_bytes', - record: 'node_memory_Cached', - }, - { - expr: 'node_memory_CommitLimit_bytes', - record: 'node_memory_CommitLimit', - }, - { - expr: 'node_memory_Committed_AS_bytes', - record: 'node_memory_Committed_AS', - }, - { - expr: 'node_memory_DirectMap2M_bytes', - record: 'node_memory_DirectMap2M', - }, - { - expr: 'node_memory_DirectMap4k_bytes', - record: 'node_memory_DirectMap4k', - }, - { - expr: 'node_memory_Dirty_bytes', - record: 'node_memory_Dirty', - }, - { - expr: 'node_memory_HardwareCorrupted_bytes', - record: 'node_memory_HardwareCorrupted', - }, - { - expr: 'node_memory_Hugepagesize_bytes', - record: 'node_memory_Hugepagesize', - }, - { - expr: 'node_memory_Inactive_bytes', - record: 'node_memory_Inactive', - }, - { - expr: 'node_memory_Inactive_anon_bytes', - record: 'node_memory_Inactive_anon', - }, - { - expr: 'node_memory_Inactive_file_bytes', - record: 'node_memory_Inactive_file', - }, - { - expr: 'node_memory_KernelStack_bytes', - record: 'node_memory_KernelStack', - }, - { - expr: 'node_memory_Mapped_bytes', - record: 'node_memory_Mapped', - }, - { - expr: 'node_memory_MemAvailable_bytes', - record: 'node_memory_MemAvailable', - }, - { - expr: 'node_memory_MemFree_bytes', - record: 'node_memory_MemFree', - }, - { - expr: 
'node_memory_MemTotal_bytes', - record: 'node_memory_MemTotal', - }, - { - expr: 'node_memory_Mlocked_bytes', - record: 'node_memory_Mlocked', - }, - { - expr: 'node_memory_NFS_Unstable_bytes', - record: 'node_memory_NFS_Unstable', - }, - { - expr: 'node_memory_PageTables_bytes', - record: 'node_memory_PageTables', - }, - { - expr: 'node_memory_Shmem_bytes', - record: 'node_memory_Shmem', - }, - { - expr: 'node_memory_Slab_bytes', - record: 'node_memory_Slab', - }, - { - expr: 'node_memory_SReclaimable_bytes', - record: 'node_memory_SReclaimable', - }, - { - expr: 'node_memory_SUnreclaim_bytes', - record: 'node_memory_SUnreclaim', - }, - { - expr: 'node_memory_SwapCached_bytes', - record: 'node_memory_SwapCached', - }, - { - expr: 'node_memory_SwapFree_bytes', - record: 'node_memory_SwapFree', - }, - { - expr: 'node_memory_SwapTotal_bytes', - record: 'node_memory_SwapTotal', - }, - { - expr: 'node_memory_Unevictable_bytes', - record: 'node_memory_Unevictable', - }, - { - expr: 'node_memory_VmallocChunk_bytes', - record: 'node_memory_VmallocChunk', - }, - { - expr: 'node_memory_VmallocTotal_bytes', - record: 'node_memory_VmallocTotal', - }, - { - expr: 'node_memory_VmallocUsed_bytes', - record: 'node_memory_VmallocUsed', - }, - { - expr: 'node_memory_Writeback_bytes', - record: 'node_memory_Writeback', - }, - { - expr: 'node_memory_WritebackTmp_bytes', - record: 'node_memory_WritebackTmp', - }, - ], - }, - { - name: 'node_exporter-16-network', - rules: [ - { - expr: 'node_network_receive_bytes_total', - record: 'node_network_receive_bytes', - }, - { - expr: 'node_network_receive_compressed_total', - record: 'node_network_receive_compressed', - }, - { - expr: 'node_network_receive_drop_total', - record: 'node_network_receive_drop', - }, - { - expr: 'node_network_receive_errs_total', - record: 'node_network_receive_errs', - }, - { - expr: 'node_network_receive_fifo_total', - record: 'node_network_receive_fifo', - }, - { - expr: 'node_network_receive_frame_total', - 
record: 'node_network_receive_frame', - }, - { - expr: 'node_network_receive_multicast_total', - record: 'node_network_receive_multicast', - }, - { - expr: 'node_network_receive_packets_total', - record: 'node_network_receive_packets', - }, - { - expr: 'node_network_transmit_bytes_total', - record: 'node_network_transmit_bytes', - }, - { - expr: 'node_network_transmit_compressed_total', - record: 'node_network_transmit_compressed', - }, - { - expr: 'node_network_transmit_drop_total', - record: 'node_network_transmit_drop', - }, - { - expr: 'node_network_transmit_errs_total', - record: 'node_network_transmit_errs', - }, - { - expr: 'node_network_transmit_fifo_total', - record: 'node_network_transmit_fifo', - }, - { - expr: 'node_network_transmit_frame_total', - record: 'node_network_transmit_frame', - }, - { - expr: 'node_network_transmit_multicast_total', - record: 'node_network_transmit_multicast', - }, - { - expr: 'node_network_transmit_packets_total', - record: 'node_network_transmit_packets', - }, - ], - }, - { - name: 'node_exporter-16-nfs', - rules: [ - { - expr: 'node_nfs_connections_total', - record: 'node_nfs_net_connections', - }, - { - expr: 'node_nfs_packets_total', - record: 'node_nfs_net_reads', - }, - { - expr: 'label_replace(label_replace(node_nfs_requests_total, "proto", "$1", "version", "(.+)"), "method", "$1", "procedure", "(.+)")', - record: 'node_nfs_procedures', - }, - { - expr: 'node_nfs_rpc_authentication_refreshes_total', - record: 'node_nfs_rpc_authentication_refreshes', - }, - { - expr: 'node_nfs_rpcs_total', - record: 'node_nfs_rpc_operations', - }, - { - expr: 'node_nfs_rpc_retransmissions_total', - record: 'node_nfs_rpc_retransmissions', - }, - ], - }, - { - name: 'node_exporter-16-textfile', - rules: [ - { - expr: 'node_textfile_mtime_seconds', - record: 'node_textfile_mtime', - }, - ], - }, - ], - }, -} diff --git a/jsonnet/kube-prometheus/rules/rules.libsonnet b/jsonnet/kube-prometheus/rules/rules.libsonnet index 6654e16b..b0217aba 
100644 --- a/jsonnet/kube-prometheus/rules/rules.libsonnet +++ b/jsonnet/kube-prometheus/rules/rules.libsonnet @@ -1,2 +1 @@ -(import 'node-rules.libsonnet') + -(import 'node-exporter-v0.16.0-compatibility-rules.libsonnet') +(import 'node-rules.libsonnet') From fbcd551bb91558c517d7941af653a9f25b644e4f Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 7 Nov 2018 18:42:52 +0100 Subject: [PATCH 482/638] contrib/kube-prometheus: Generate manifests without compatibility node rules --- jsonnetfile.lock.json | 2 +- manifests/prometheus-rules.yaml | 195 -------------------------------- 2 files changed, 1 insertion(+), 196 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 8c6ef28c..69f284d0 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "c9350aab06b47bcf8410b597ba50b4addf21ee3d" + "version": "f67e5f2039fa8cf18a23b05e0cbaba706add90ec" }, { "name": "ksonnet", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 6a5df8a3..28476cb3 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -282,201 +282,6 @@ spec: record: cluster:node_cpu:sum_rate5m - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) record: cluster:node_cpu:ratio - - name: node_exporter-16-bcache - rules: - - expr: node_bcache_cache_read_races - record: node_bcache_cache_read_races_total - - name: node_exporter-16-buddyinfo - rules: - - expr: node_buddyinfo_blocks - record: node_buddyinfo_count - - name: node_exporter-16-stat - rules: - - expr: node_boot_time_seconds - record: node_boot_time - - expr: node_context_switches_total - record: node_context_switches - - expr: node_forks_total - record: node_forks - - expr: node_intr_total - record: node_intr - - name: node_exporter-16-cpu - rules: - - expr: label_replace(node_cpu_seconds_total, "cpu", "$1", "cpu", "cpu(.+)") - record: node_cpu 
- - name: node_exporter-16-diskstats - rules: - - expr: node_disk_read_bytes_total - record: node_disk_bytes_read - - expr: node_disk_written_bytes_total - record: node_disk_bytes_written - - expr: node_disk_io_time_seconds_total * 1000 - record: node_disk_io_time_ms - - expr: node_disk_io_time_weighted_seconds_total - record: node_disk_io_time_weighted - - expr: node_disk_reads_completed_total - record: node_disk_reads_completed - - expr: node_disk_reads_merged_total - record: node_disk_reads_merged - - expr: node_disk_read_time_seconds_total * 1000 - record: node_disk_read_time_ms - - expr: node_disk_writes_completed_total - record: node_disk_writes_completed - - expr: node_disk_writes_merged_total - record: node_disk_writes_merged - - expr: node_disk_write_time_seconds_total * 1000 - record: node_disk_write_time_ms - - name: node_exporter-16-filesystem - rules: - - expr: node_filesystem_free_bytes - record: node_filesystem_free - - expr: node_filesystem_avail_bytes - record: node_filesystem_avail - - expr: node_filesystem_size_bytes - record: node_filesystem_size - - name: node_exporter-16-infiniband - rules: - - expr: node_infiniband_port_data_received_bytes_total - record: node_infiniband_port_data_received_bytes - - expr: node_infiniband_port_data_transmitted_bytes_total - record: node_infiniband_port_data_transmitted_bytes - - name: node_exporter-16-interrupts - rules: - - expr: node_interrupts_total - record: node_interrupts - - name: node_exporter-16-memory - rules: - - expr: node_memory_Active_bytes - record: node_memory_Active - - expr: node_memory_Active_anon_bytes - record: node_memory_Active_anon - - expr: node_memory_Active_file_bytes - record: node_memory_Active_file - - expr: node_memory_AnonHugePages_bytes - record: node_memory_AnonHugePages - - expr: node_memory_AnonPages_bytes - record: node_memory_AnonPages - - expr: node_memory_Bounce_bytes - record: node_memory_Bounce - - expr: node_memory_Buffers_bytes - record: node_memory_Buffers - - expr: 
node_memory_Cached_bytes - record: node_memory_Cached - - expr: node_memory_CommitLimit_bytes - record: node_memory_CommitLimit - - expr: node_memory_Committed_AS_bytes - record: node_memory_Committed_AS - - expr: node_memory_DirectMap2M_bytes - record: node_memory_DirectMap2M - - expr: node_memory_DirectMap4k_bytes - record: node_memory_DirectMap4k - - expr: node_memory_Dirty_bytes - record: node_memory_Dirty - - expr: node_memory_HardwareCorrupted_bytes - record: node_memory_HardwareCorrupted - - expr: node_memory_Hugepagesize_bytes - record: node_memory_Hugepagesize - - expr: node_memory_Inactive_bytes - record: node_memory_Inactive - - expr: node_memory_Inactive_anon_bytes - record: node_memory_Inactive_anon - - expr: node_memory_Inactive_file_bytes - record: node_memory_Inactive_file - - expr: node_memory_KernelStack_bytes - record: node_memory_KernelStack - - expr: node_memory_Mapped_bytes - record: node_memory_Mapped - - expr: node_memory_MemAvailable_bytes - record: node_memory_MemAvailable - - expr: node_memory_MemFree_bytes - record: node_memory_MemFree - - expr: node_memory_MemTotal_bytes - record: node_memory_MemTotal - - expr: node_memory_Mlocked_bytes - record: node_memory_Mlocked - - expr: node_memory_NFS_Unstable_bytes - record: node_memory_NFS_Unstable - - expr: node_memory_PageTables_bytes - record: node_memory_PageTables - - expr: node_memory_Shmem_bytes - record: node_memory_Shmem - - expr: node_memory_Slab_bytes - record: node_memory_Slab - - expr: node_memory_SReclaimable_bytes - record: node_memory_SReclaimable - - expr: node_memory_SUnreclaim_bytes - record: node_memory_SUnreclaim - - expr: node_memory_SwapCached_bytes - record: node_memory_SwapCached - - expr: node_memory_SwapFree_bytes - record: node_memory_SwapFree - - expr: node_memory_SwapTotal_bytes - record: node_memory_SwapTotal - - expr: node_memory_Unevictable_bytes - record: node_memory_Unevictable - - expr: node_memory_VmallocChunk_bytes - record: node_memory_VmallocChunk - - 
expr: node_memory_VmallocTotal_bytes - record: node_memory_VmallocTotal - - expr: node_memory_VmallocUsed_bytes - record: node_memory_VmallocUsed - - expr: node_memory_Writeback_bytes - record: node_memory_Writeback - - expr: node_memory_WritebackTmp_bytes - record: node_memory_WritebackTmp - - name: node_exporter-16-network - rules: - - expr: node_network_receive_bytes_total - record: node_network_receive_bytes - - expr: node_network_receive_compressed_total - record: node_network_receive_compressed - - expr: node_network_receive_drop_total - record: node_network_receive_drop - - expr: node_network_receive_errs_total - record: node_network_receive_errs - - expr: node_network_receive_fifo_total - record: node_network_receive_fifo - - expr: node_network_receive_frame_total - record: node_network_receive_frame - - expr: node_network_receive_multicast_total - record: node_network_receive_multicast - - expr: node_network_receive_packets_total - record: node_network_receive_packets - - expr: node_network_transmit_bytes_total - record: node_network_transmit_bytes - - expr: node_network_transmit_compressed_total - record: node_network_transmit_compressed - - expr: node_network_transmit_drop_total - record: node_network_transmit_drop - - expr: node_network_transmit_errs_total - record: node_network_transmit_errs - - expr: node_network_transmit_fifo_total - record: node_network_transmit_fifo - - expr: node_network_transmit_frame_total - record: node_network_transmit_frame - - expr: node_network_transmit_multicast_total - record: node_network_transmit_multicast - - expr: node_network_transmit_packets_total - record: node_network_transmit_packets - - name: node_exporter-16-nfs - rules: - - expr: node_nfs_connections_total - record: node_nfs_net_connections - - expr: node_nfs_packets_total - record: node_nfs_net_reads - - expr: label_replace(label_replace(node_nfs_requests_total, "proto", "$1", "version", - "(.+)"), "method", "$1", "procedure", "(.+)") - record: 
node_nfs_procedures - - expr: node_nfs_rpc_authentication_refreshes_total - record: node_nfs_rpc_authentication_refreshes - - expr: node_nfs_rpcs_total - record: node_nfs_rpc_operations - - expr: node_nfs_rpc_retransmissions_total - record: node_nfs_rpc_retransmissions - - name: node_exporter-16-textfile - rules: - - expr: node_textfile_mtime_seconds - record: node_textfile_mtime - name: kubernetes-absent rules: - alert: AlertmanagerDown From a7ba98a92df40ec9fb4ac043c938e4cbabe8fbc8 Mon Sep 17 00:00:00 2001 From: Vasily Sliouniaev Date: Thu, 15 Nov 2018 08:28:13 +0000 Subject: [PATCH 483/638] Update lockfile --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 69f284d0..da8716c5 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "f67e5f2039fa8cf18a23b05e0cbaba706add90ec" + "version": "fb5adf87615bb46df0ccf44819f7e59e1b68c6e8" }, { "name": "ksonnet", From 283d34b882e6bfc0f62abfd4b1c1ac113175e4ed Mon Sep 17 00:00:00 2001 From: Vincent Brouillet Date: Fri, 9 Nov 2018 12:14:19 +1100 Subject: [PATCH 484/638] add example for kube-aws --- README.md | 8 ++++++++ examples/jsonnet-snippets/kube-aws.jsonnet | 2 ++ .../kube-prometheus-kube-aws.libsonnet | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 examples/jsonnet-snippets/kube-aws.jsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet diff --git a/README.md b/README.md index 04c7eb86..b8f69a8b 100644 --- a/README.md +++ b/README.md @@ -367,6 +367,14 @@ kubespray: (import 'kube-prometheus/kube-prometheus-kubespray.libsonnet') ``` +kube-aws: + +[embedmd]:# (examples/jsonnet-snippets/kube-aws.jsonnet) +```jsonnet +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kube-aws.libsonnet') +``` + ### Internal Registry Some Kubernetes 
installations source all their images from an internal registry. kube-prometheus supports this use case and helps the user synchronize every image it uses to the internal registry and generate manifests pointing at the internal registry. diff --git a/examples/jsonnet-snippets/kube-aws.jsonnet b/examples/jsonnet-snippets/kube-aws.jsonnet new file mode 100644 index 00000000..b0842eb2 --- /dev/null +++ b/examples/jsonnet-snippets/kube-aws.jsonnet @@ -0,0 +1,2 @@ +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kube-aws.libsonnet') diff --git a/jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet new file mode 100644 index 00000000..8a69d215 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet @@ -0,0 +1,18 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+: { + kubeControllerManagerPrometheusDiscoveryService: + service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) + + service.mixin.spec.withClusterIp('None'), + kubeSchedulerPrometheusDiscoveryService: + service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + + service.mixin.spec.withClusterIp('None'), + }, +} From c066b4bc893b142b00e12a3148039b9fe4b029b1 Mon Sep 17 00:00:00 2001 From: Vincent Brouillet Date: Tue, 20 Nov 2018 11:06:00 +1100 Subject: [PATCH 485/638] rebase --- jsonnetfile.lock.json | 12 ++++++------ 1 file 
changed, 6 insertions(+), 6 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index da8716c5..0058ea87 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "fb5adf87615bb46df0ccf44819f7e59e1b68c6e8" + "version": "677cb68995f2632cd5a1ecb26e52e3a7c743322b" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "f7ca48cca5d9cadc9a2203b8c0b3bb3eb85f3294" + "version": "c0b31ea63564966021f9e6010090acded475b192" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "d407225c5a2e087eb68843528aab2be0507c73b8" + "version": "eea8b5ba6b8883cf2df5a17c39a42c4b57c0d63e" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "90fbdbf08cf0d4bdc78ab52151041da36a7b0abc" + "version": "5b6050e8e883f24b508a18d4b02d1637ec4a540a" }, { "name": "grafana", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "455e08134e1a135f41e1032576487921a759cf51" + "version": "3df5e36ecbf348a13e155e12c495ac9fd05030b6" }, { "name": "prometheus-operator", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "ee9dcbca0d89dc563c9e6bc725fab0c6f21d689b" + "version": "bb25891960b9ebbe0f526d1e067f94906d6fb58f" } ] } From 0dec594c417b2423f42b033552c8ccef661f1034 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 20 Nov 2018 14:10:41 +0100 Subject: [PATCH 486/638] kube-prometheus: Re-generate --- manifests/grafana-deployment.yaml | 4 ++++ manifests/prometheus-rules.yaml | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index e378f689..6816ce2f 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -21,6 +21,10 @@ spec: ports: - containerPort: 3000 name: http + readinessProbe: + httpGet: + path: /api/health + port: http resources: limits: cpu: 200m 
diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 28476cb3..ad3a6065 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -595,8 +595,8 @@ spec: }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh - expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by - (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total[5m])) + expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m])) + by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)\n > 25 \n" for: 15m labels: From 43bb05692fe3c6f0acc42da5db68786ca5dbde6d Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Tue, 20 Nov 2018 17:53:42 +0100 Subject: [PATCH 487/638] contrib/kube-prometheus: ne rbacproxy listen podip This commit adjusts the RBAC proxy for the node-exporter DaemonSet to only listen on the Pod IP. It also adjusts the ports used by the node-exporter Pod so that both containers are listening on 9100. The actual node-exporter listens on 127.0.0.1:9100, while the RBAC proxy listens on :9100. This ensures that port 9101 is not taken on the host networking namespace. 
--- .../node-exporter/node-exporter.libsonnet | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 8ac3d73e..3524e11a 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -58,6 +58,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local containerVolumeMount = container.volumeMountsType; local podSelector = daemonset.mixin.spec.template.spec.selectorType; local toleration = daemonset.mixin.spec.template.spec.tolerationsType; + local containerEnv = container.envType; local podLabels = { app: 'node-exporter' }; @@ -82,7 +83,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local nodeExporter = container.new('node-exporter', $._config.imageRepos.nodeExporter + ':' + $._config.versions.nodeExporter) + container.withArgs([ - '--web.listen-address=127.0.0.1:9101', + '--web.listen-address=127.0.0.1:9100', '--path.procfs=/host/proc', '--path.sysfs=/host/sys', @@ -96,15 +97,25 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + container.mixin.resources.withLimits({ cpu: '102m', memory: '180Mi' }); + local ip = containerEnv.fromFieldPath('IP', 'status.podIP'); local proxy = container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ - '--secure-listen-address=:9100', - '--upstream=http://127.0.0.1:9101/', + '--secure-listen-address=$(IP):9100', + '--upstream=http://127.0.0.1:9100/', ]) + + // Keep `hostPort` here, rather than in the node-exporter container + // because Kubernetes mandates that if you define a `hostPort` then + // `containerPort` must match. 
In our case, we are splitting the + // host port and container port between the two containers. + // We'll keep the port specification here so that the named port + // used by the service is tied to the proxy container. We *could* + // forgo declaring the host port, however it is important to declare + // it so that the scheduler can decide if the pod is schedulable. container.withPorts(containerPort.new(9100) + containerPort.withHostPort(9100) + containerPort.withName('https')) + container.mixin.resources.withRequests({ cpu: '10m', memory: '20Mi' }) + - container.mixin.resources.withLimits({ cpu: '20m', memory: '40Mi' }); + container.mixin.resources.withLimits({ cpu: '20m', memory: '40Mi' }) + + container.withEnv([ip]); local c = [nodeExporter, proxy]; From 7b5f15ff8401b313e1d6c9f2601d3cffbfa44926 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Tue, 20 Nov 2018 20:03:10 +0100 Subject: [PATCH 488/638] contrib: regenerate --- jsonnetfile.lock.json | 2 +- manifests/node-exporter-daemonset.yaml | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 0058ea87..4cab772d 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "677cb68995f2632cd5a1ecb26e52e3a7c743322b" + "version": "f2724c252dad424580f3d5061304f88b4e1a2bb5" }, { "name": "ksonnet", diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index ee084f0a..8f5f4ff4 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - args: - - --web.listen-address=127.0.0.1:9101 + - --web.listen-address=127.0.0.1:9100 - --path.procfs=/host/proc - --path.sysfs=/host/sys - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) @@ -42,8 +42,13 @@ spec: name: root readOnly: true - args: - - 
--secure-listen-address=:9100 - - --upstream=http://127.0.0.1:9101/ + - --secure-listen-address=$(IP):9100 + - --upstream=http://127.0.0.1:9100/ + env: + - name: IP + valueFrom: + fieldRef: + fieldPath: status.podIP image: quay.io/coreos/kube-rbac-proxy:v0.4.0 name: kube-rbac-proxy ports: From d7fc1eb32b53b914820d7777698f71f9a339234c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 21 Nov 2018 16:22:06 +0100 Subject: [PATCH 489/638] kube-prometheus: Adapt CoreDNS default configuration to default CoreDNS installs --- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 2 ++ jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 2dd32b50..f59cc80f 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -29,6 +29,7 @@ local configMapList = k.core.v1.configMapList; kubeSchedulerSelector: 'job="kube-scheduler"', kubeControllerManagerSelector: 'job="kube-controller-manager"', kubeApiserverSelector: 'job="apiserver"', + coreDNSSelector: 'job="kube-dns"', podLabel: 'pod', alertmanagerSelector: 'job="alertmanager-main"', @@ -45,6 +46,7 @@ local configMapList = k.core.v1.configMapList; Alertmanager: $._config.alertmanagerSelector, Prometheus: $._config.prometheusSelector, PrometheusOperator: $._config.prometheusOperatorSelector, + CoreDNS: $._config.coreDNSSelector, }, prometheus+:: { diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index fafcd4d6..f9de222e 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -386,11 +386,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, }, spec: { - jobLabel: 'k8s-app', selector: { matchLabels: { - 'k8s-app': 
'coredns', - component: 'metrics', + 'k8s-app': 'kube-dns', }, }, namespaceSelector: { @@ -400,7 +398,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, endpoints: [ { - port: 'http-metrics', + port: 'metrics', interval: '15s', bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', }, From dddcdb223a8deae5e386c9c95bb1b0391e4a2ef1 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 21 Nov 2018 16:25:48 +0100 Subject: [PATCH 490/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 4 ++-- manifests/prometheus-rules.yaml | 9 +++++++++ manifests/prometheus-serviceMonitorCoreDNS.yaml | 6 ++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 4cab772d..3b9d80cd 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "f2724c252dad424580f3d5061304f88b4e1a2bb5" + "version": "dff8f44fbce268596c86b8d586c64c17953feab3" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "bb25891960b9ebbe0f526d1e067f94906d6fb58f" + "version": "02a9810a9e4e5c95feed4a6d6d2c5525fe2af1c1" } ] } diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index ad3a6065..a7987abc 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -293,6 +293,15 @@ spec: for: 15m labels: severity: critical + - alert: CoreDNSDown + annotations: + message: CoreDNS has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown + expr: | + absent(up{job="kube-dns"} == 1) + for: 15m + labels: + severity: critical - alert: KubeAPIDown annotations: message: KubeAPI has disappeared from Prometheus target discovery. 
diff --git a/manifests/prometheus-serviceMonitorCoreDNS.yaml b/manifests/prometheus-serviceMonitorCoreDNS.yaml index 12a4c5bf..14a24545 100644 --- a/manifests/prometheus-serviceMonitorCoreDNS.yaml +++ b/manifests/prometheus-serviceMonitorCoreDNS.yaml @@ -9,12 +9,10 @@ spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token interval: 15s - port: http-metrics - jobLabel: k8s-app + port: metrics namespaceSelector: matchNames: - kube-system selector: matchLabels: - component: metrics - k8s-app: coredns + k8s-app: kube-dns From d823b7f74bce2042dc34f0749d3cc5b483f8d341 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 22 Nov 2018 15:28:14 +0100 Subject: [PATCH 491/638] contrib/kube-prometheus: Add prometheusURL to adapter's jsonnet config --- .../prometheus-adapter/prometheus-adapter.libsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet index 16189d39..ac675931 100644 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -15,6 +15,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheusAdapter+:: { name: 'prometheus-adapter', labels: { name: $._config.prometheusAdapter.name }, + prometheusURL: 'http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc:9090/', config: ||| resourceRules: cpu: @@ -98,7 +99,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--config=/etc/adapter/config.yaml', '--logtostderr=true', '--metrics-relist-interval=1m', - '--prometheus-url=http://prometheus-' + $._config.prometheus.name + '.' 
+ $._config.namespace + '.svc:9090/', + '--prometheus-url=' + $._config.prometheusAdapter.prometheusURL, '--secure-port=6443', ]) + container.withPorts([{ containerPort: 6443 }]) + From 95bb49aa4e19283017bc3d49782f6fe31593ef7e Mon Sep 17 00:00:00 2001 From: Golubkov Igor Date: Tue, 20 Nov 2018 15:51:57 +0300 Subject: [PATCH 492/638] Fix AlertmanagerMembersInconsistent rule --- jsonnet/kube-prometheus/alerts/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index 42ae3f98..bf58862d 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -37,7 +37,7 @@ }, expr: ||| alertmanager_cluster_members{%(alertmanagerSelector)s} - != on (service) + != on (service) GROUP_LEFT() count by (service) (alertmanager_cluster_members{%(alertmanagerSelector)s}) ||| % $._config, 'for': '5m', From 16b147863cee9723a1b960ccbcd7b809a0063eee Mon Sep 17 00:00:00 2001 From: Golubkov Igor Date: Thu, 22 Nov 2018 17:15:25 +0300 Subject: [PATCH 493/638] Update contrib/kube-prometheus --- jsonnetfile.lock.json | 2 +- manifests/prometheus-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 3b9d80cd..3665f14f 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "dff8f44fbce268596c86b8d586c64c17953feab3" + "version": "9cc151ced4308573a91f4cc3fcdbc951213b03e0" }, { "name": "ksonnet", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index a7987abc..872cf310 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -790,7 +790,7 @@ spec: message: Alertmanager has not found all other members of the cluster. 
expr: | alertmanager_cluster_members{job="alertmanager-main"} - != on (service) + != on (service) GROUP_LEFT() count by (service) (alertmanager_cluster_members{job="alertmanager-main"}) for: 5m labels: From 48ab110e09c8bbaa7f12f2667032ac1a86b5a6f8 Mon Sep 17 00:00:00 2001 From: Golubkov Igor Date: Thu, 22 Nov 2018 17:25:36 +0300 Subject: [PATCH 494/638] Update unit tests --- jsonnet/kube-prometheus/alerts/tests.yaml | 113 ++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/jsonnet/kube-prometheus/alerts/tests.yaml b/jsonnet/kube-prometheus/alerts/tests.yaml index 8cfc3aa7..532bb895 100644 --- a/jsonnet/kube-prometheus/alerts/tests.yaml +++ b/jsonnet/kube-prometheus/alerts/tests.yaml @@ -24,6 +24,10 @@ tests: - exp_labels: service: 'alertmanager-main' severity: critical + job: 'alertmanager-main' + instance: 10.10.10.0 + namespace: monitoring + pod: alertmanager-main-0 exp_annotations: message: 'Alertmanager has not found all other members of the cluster.' - eval_time: 17m @@ -32,6 +36,10 @@ tests: - exp_labels: service: 'alertmanager-main' severity: critical + job: 'alertmanager-main' + instance: 10.10.10.0 + namespace: monitoring + pod: alertmanager-main-0 exp_annotations: message: 'Alertmanager has not found all other members of the cluster.' - eval_time: 23m @@ -40,5 +48,110 @@ tests: - exp_labels: service: 'alertmanager-main' severity: critical + job: 'alertmanager-main' + instance: 10.10.10.0 + namespace: monitoring + pod: alertmanager-main-0 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' 
+ - interval: 1m + input_series: + - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' + values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' + values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' + - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' + values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' + alert_rule_test: + - eval_time: 5m + alertname: AlertmanagerMembersInconsistent + - eval_time: 11m + alertname: AlertmanagerMembersInconsistent + exp_alerts: + - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.0 + namespace: monitoring + pod: alertmanager-main-0 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.1 + namespace: monitoring + pod: alertmanager-main-1 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.2 + namespace: monitoring + pod: alertmanager-main-2 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - eval_time: 17m + alertname: AlertmanagerMembersInconsistent + exp_alerts: + - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.0 + namespace: monitoring + pod: alertmanager-main-0 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' 
+ - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.1 + namespace: monitoring + pod: alertmanager-main-1 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.2 + namespace: monitoring + pod: alertmanager-main-2 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - eval_time: 23m + alertname: AlertmanagerMembersInconsistent + exp_alerts: + - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.0 + namespace: monitoring + pod: alertmanager-main-0 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.1 + namespace: monitoring + pod: alertmanager-main-1 + exp_annotations: + message: 'Alertmanager has not found all other members of the cluster.' + - exp_labels: + service: 'alertmanager-main' + severity: critical + job: 'alertmanager-main' + instance: 10.10.10.2 + namespace: monitoring + pod: alertmanager-main-2 exp_annotations: message: 'Alertmanager has not found all other members of the cluster.' 
From 4a15683db43d43f9575c7340019ee5ca5f7625be Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 27 Nov 2018 15:29:46 +0000 Subject: [PATCH 495/638] Do not error if no $._config.prometheus.rules are set --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index f9de222e..d0ce82c2 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -40,7 +40,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; service.new('prometheus-' + $._config.prometheus.name, { app: 'prometheus', prometheus: $._config.prometheus.name }, prometheusPort) + service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ prometheus: $._config.prometheus.name }), - rules: + [if $._config.prometheus.rules != null and $._config.prometheus.rules != {} then "rules"]: { apiVersion: 'monitoring.coreos.com/v1', kind: 'PrometheusRule', From db07ca6e128412afa25d121c61de33eed342c22d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 27 Nov 2018 16:49:29 +0000 Subject: [PATCH 496/638] Change examples to be more robust against customizations Explicitly reference values we are interested in (jsonnet style) instead of just following conventions (helm style) --- docs/exposing-prometheus-alertmanager-grafana-ingress.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/exposing-prometheus-alertmanager-grafana-ingress.md b/docs/exposing-prometheus-alertmanager-grafana-ingress.md index 9874dc20..007dc641 100644 --- a/docs/exposing-prometheus-alertmanager-grafana-ingress.md +++ b/docs/exposing-prometheus-alertmanager-grafana-ingress.md @@ -50,8 +50,8 @@ local kp = ingress+:: { 'prometheus-k8s': ingress.new() + - ingress.mixin.metadata.withName('prometheus-k8s') + - 
ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withName($.prometheus.prometheus.metadata.name) + + ingress.mixin.metadata.withNamespace($.prometheus.prometheus.metadata.namespace) + ingress.mixin.metadata.withAnnotations({ 'nginx.ingress.kubernetes.io/auth-type': 'basic', 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', @@ -62,7 +62,7 @@ local kp = ingressRule.withHost('prometheus.example.com') + ingressRule.mixin.http.withPaths( httpIngressPath.new() + - httpIngressPath.mixin.backend.withServiceName('prometheus-k8s') + + httpIngressPath.mixin.backend.withServiceName($.prometheus.service.metadata.name) + httpIngressPath.mixin.backend.withServicePort('web') ), ), From b9a6730daf133197f36303461f8224343b32e3ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20St=C3=A4ber?= Date: Wed, 28 Nov 2018 10:20:23 +0100 Subject: [PATCH 497/638] fix syntax error in prometheus.libsonnet --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index d0ce82c2..8d19c456 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -40,7 +40,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; service.new('prometheus-' + $._config.prometheus.name, { app: 'prometheus', prometheus: $._config.prometheus.name }, prometheusPort) + service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ prometheus: $._config.prometheus.name }), - [if $._config.prometheus.rules != null and $._config.prometheus.rules != {} then "rules"]: + [if $._config.prometheus.rules != null && $._config.prometheus.rules != {} then "rules"]: { apiVersion: 'monitoring.coreos.com/v1', kind: 'PrometheusRule', From 989456cde9556b51bcf717247ea41ffe3ef934c1 Mon Sep 17 00:00:00 2001 
From: Matthias Loibl Date: Tue, 20 Nov 2018 16:27:52 +0100 Subject: [PATCH 498/638] *: Increase CPU limits for small containers to not being throttled as much --- .../kube-state-metrics/kube-state-metrics.libsonnet | 2 +- jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 167e4632..a3bb0ec6 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -182,7 +182,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, ]) + container.mixin.resources.withRequests({ cpu: '10m', memory: '30Mi' }) + - container.mixin.resources.withLimits({ cpu: '10m', memory: '30Mi' }); + container.mixin.resources.withLimits({ cpu: '50m', memory: '30Mi' }); local c = [proxyClusterMetrics, proxySelfMetrics, kubeStateMetrics, addonResizer]; diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 3524e11a..c9e1faeb 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -95,7 +95,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; ]) + container.withVolumeMounts([procVolumeMount, sysVolumeMount, rootVolumeMount]) + container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + - container.mixin.resources.withLimits({ cpu: '102m', memory: '180Mi' }); + container.mixin.resources.withLimits({ cpu: '250m', memory: '180Mi' }); local ip = containerEnv.fromFieldPath('IP', 'status.podIP'); local proxy = From d22cf7477e8b00f0fd9bc2755c3009178845ad40 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 28 Nov 2018 14:31:11 +0100 Subject: [PATCH 499/638] 
contrib/kube-prometheus: Generate manifests with higher limits --- jsonnetfile.lock.json | 2 +- manifests/kube-state-metrics-deployment.yaml | 2 +- manifests/node-exporter-daemonset.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 3665f14f..5fa07480 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "9cc151ced4308573a91f4cc3fcdbc951213b03e0" + "version": "606a53d42a836baa950f138be43fae7ae98821cd" }, { "name": "ksonnet", diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index 483b7c6d..87aa40af 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -84,7 +84,7 @@ spec: name: addon-resizer resources: limits: - cpu: 10m + cpu: 50m memory: 30Mi requests: cpu: 10m diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 8f5f4ff4..28d424b8 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -25,7 +25,7 @@ spec: name: node-exporter resources: limits: - cpu: 102m + cpu: 250m memory: 180Mi requests: cpu: 102m From 2f23e374c7147c9358fd826c202a85be9ca32489 Mon Sep 17 00:00:00 2001 From: Stefan Knott Date: Wed, 28 Nov 2018 15:49:26 +0100 Subject: [PATCH 500/638] fix breaking typo the operator spec key is named "externalUrl", rather than "externalURL" --- docs/exposing-prometheus-alertmanager-grafana-ingress.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/exposing-prometheus-alertmanager-grafana-ingress.md b/docs/exposing-prometheus-alertmanager-grafana-ingress.md index 007dc641..7fb2d4f6 100644 --- a/docs/exposing-prometheus-alertmanager-grafana-ingress.md +++ b/docs/exposing-prometheus-alertmanager-grafana-ingress.md @@ -43,7 +43,7 @@ local kp = prometheus+:: { 
prometheus+: { spec+: { - externalURL: 'http://prometheus.example.com', + externalUrl: 'http://prometheus.example.com', }, }, }, From 64a5d10227a112000699bb0a1f89163df50fcfd7 Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Fri, 30 Nov 2018 14:14:16 +0100 Subject: [PATCH 501/638] kube-prometheus: bump prometheus-operator --- jsonnet/kube-prometheus/jsonnetfile.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index d42e422a..bae1e245 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.25.0" + "version": "v0.26.0" }, { "name": "etcd-mixin", @@ -51,4 +51,4 @@ "version": "master" } ] -} \ No newline at end of file +} From fc653360821d41ba8d75e9c298e6a8efcd49e859 Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Fri, 30 Nov 2018 14:28:45 +0100 Subject: [PATCH 502/638] kube-prometheus: regenerate --- jsonnetfile.lock.json | 10 +-- ...0alertmanagerCustomResourceDefinition.yaml | 66 +------------------ ...r-0prometheusCustomResourceDefinition.yaml | 62 ----------------- .../0prometheus-operator-deployment.yaml | 4 +- manifests/prometheus-rules.yaml | 8 +-- 5 files changed, 13 insertions(+), 137 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5fa07480..0e75a6d8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "606a53d42a836baa950f138be43fae7ae98821cd" + "version": "433616b23b9c4bce759bc99c35ca2a66348c36b8" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "c0b31ea63564966021f9e6010090acded475b192" + "version": "64a06754786ca1a28929b9fb05c381085dcdd44c" }, { "name": "grafonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": 
"5b6050e8e883f24b508a18d4b02d1637ec4a540a" + "version": "d60a39a5c01f651fdfef2db7a710bb5319b0dbc4" }, { "name": "grafana", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "82a6ad2071ff653e38b3b4719ecb789d73f3ab05" + "version": "72ec4b9b16ef11700724dc71fec77112536eed40" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "02a9810a9e4e5c95feed4a6d6d2c5525fe2af1c1" + "version": "dedae6eb7c253635e70403a1fb04842700277b23" } ] } diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 22248a54..d5c94fc9 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -1713,8 +1713,8 @@ spec: type: object retention: description: Time duration Alertmanager shall retain data for. Default - is '120h', and must match the regular expression `[0-9]+(ms|s|m|h|d|w|y)` - (milliseconds seconds minutes hours days weeks years). + is '120h', and must match the regular expression `[0-9]+(ms|s|m|h)` + (milliseconds seconds minutes hours). type: string routePrefix: description: The route prefix Alertmanager registers HTTP handlers for. @@ -1830,11 +1830,6 @@ spec: is specified, then by default an [EmptyDir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) will be used. properties: - class: - description: 'Name of the StorageClass to use when requesting storage - provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses - (DEPRECATED - instead use `volumeClaimTemplate.spec.storageClassName`)' - type: string emptyDir: description: Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling. @@ -1845,63 +1840,6 @@ spec: Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: {} - resources: - description: ResourceRequirements describes the compute resource - requirements. - properties: - limits: - description: 'Limits describes the maximum amount of compute - resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' - type: object - requests: - description: 'Requests describes the minimum amount of compute - resources required. If Requests is omitted for a container, - it defaults to Limits if that is explicitly specified, otherwise - to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' - type: object - selector: - description: A label selector is a label query over a set of resources. - The result of matchLabels and matchExpressions are ANDed. An empty - label selector matches all objects. A null label selector matches - no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. - The requirements are ANDed. - items: - description: A label selector requirement is a selector that - contains values, a key, and an operator that relates the - key and values. - properties: - key: - description: key is the label key that the selector applies - to. - type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, Exists - and DoesNotExist. - type: string - values: - description: values is an array of string values. If the - operator is In or NotIn, the values array must be non-empty. - If the operator is Exists or DoesNotExist, the values - array must be empty. This array is replaced during a - strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: array - matchLabels: - description: matchLabels is a map of {key,value} pairs. 
A single - {key,value} in the matchLabels map is equivalent to an element - of matchExpressions, whose key field is "key", the operator - is "In", and the values array contains only "value". The requirements - are ANDed. - type: object volumeClaimTemplate: description: PersistentVolumeClaim is a user's request for and claim to a persistent volume diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 58265136..d825277e 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -2416,11 +2416,6 @@ spec: is specified, then by default an [EmptyDir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) will be used. properties: - class: - description: 'Name of the StorageClass to use when requesting storage - provisioning. More info: https://kubernetes.io/docs/user-guide/persistent-volumes/#storageclasses - (DEPRECATED - instead use `volumeClaimTemplate.spec.storageClassName`)' - type: string emptyDir: description: Represents an empty directory for a pod. Empty directory volumes support ownership management and SELinux relabeling. @@ -2431,63 +2426,6 @@ spec: Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: {} - resources: - description: ResourceRequirements describes the compute resource - requirements. - properties: - limits: - description: 'Limits describes the maximum amount of compute - resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' - type: object - requests: - description: 'Requests describes the minimum amount of compute - resources required. 
If Requests is omitted for a container, - it defaults to Limits if that is explicitly specified, otherwise - to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' - type: object - selector: - description: A label selector is a label query over a set of resources. - The result of matchLabels and matchExpressions are ANDed. An empty - label selector matches all objects. A null label selector matches - no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. - The requirements are ANDed. - items: - description: A label selector requirement is a selector that - contains values, a key, and an operator that relates the - key and values. - properties: - key: - description: key is the label key that the selector applies - to. - type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, Exists - and DoesNotExist. - type: string - values: - description: values is an array of string values. If the - operator is In or NotIn, the values array must be non-empty. - If the operator is Exists or DoesNotExist, the values - array must be empty. This array is replaced during a - strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: array - matchLabels: - description: matchLabels is a map of {key,value} pairs. A single - {key,value} in the matchLabels map is equivalent to an element - of matchExpressions, whose key field is "key", the operator - is "In", and the values array contains only "value". The requirements - are ANDed. 
- type: object volumeClaimTemplate: description: PersistentVolumeClaim is a user's request for and claim to a persistent volume diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index a82bf6f3..1ddbae2f 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -20,8 +20,8 @@ spec: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.25.0 - image: quay.io/coreos/prometheus-operator:v0.25.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.26.0 + image: quay.io/coreos/prometheus-operator:v0.26.0 name: prometheus-operator ports: - containerPort: 8080 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 872cf310..01ca8ddb 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -213,21 +213,21 @@ spec: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | - avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])) + avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])) record: :node_disk_utilisation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) + irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3) record: 
:node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) From 5d20f0943efddad320c0c401decb2fbade68b1b6 Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Tue, 4 Dec 2018 08:42:26 +1300 Subject: [PATCH 503/638] example(ingress): Fix ExternalUrl key - see 2172 --- examples/ingress.jsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ingress.jsonnet b/examples/ingress.jsonnet index 7b89094f..4aba212a 100644 --- a/examples/ingress.jsonnet +++ b/examples/ingress.jsonnet @@ -15,7 +15,7 @@ local kp = alertmanager+:: { alertmanager+: { spec+: { - externalURL: 'http://alertmanager.example.com', + externalUrl: 'http://alertmanager.example.com', }, }, }, @@ -31,7 +31,7 @@ local kp = prometheus+:: { prometheus+: { spec+: { - externalURL: 'http://prometheus.example.com', + externalUrl: 'http://prometheus.example.com', }, }, }, From dee254fa01293618dc8dd77cc228c157234f9c85 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 7 Dec 2018 17:32:34 +0100 Subject: [PATCH 504/638] contrib/kube-prometheus: Set securityContext in the manifests --- .../kube-prometheus/alertmanager/alertmanager.libsonnet | 5 +++++ jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index e109b0ad..347d9a3c 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -113,6 +113,11 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; baseImage: $._config.imageRepos.alertmanager, nodeSelector: 
{ 'beta.kubernetes.io/os': 'linux' }, serviceAccountName: 'alertmanager-' + $._config.alertmanager.name, + securityContext: { + runAsUser: 1000, + runAsNonRoot: true, + fsGroup: 2000, + }, }, }, }, diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 8d19c456..c745f1c4 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -40,7 +40,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; service.new('prometheus-' + $._config.prometheus.name, { app: 'prometheus', prometheus: $._config.prometheus.name }, prometheusPort) + service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ prometheus: $._config.prometheus.name }), - [if $._config.prometheus.rules != null && $._config.prometheus.rules != {} then "rules"]: + [if $._config.prometheus.rules != null && $._config.prometheus.rules != {} then 'rules']: { apiVersion: 'monitoring.coreos.com/v1', kind: 'PrometheusRule', @@ -185,6 +185,11 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, ], }, + securityContext: { + runAsUser: 1000, + runAsNonRoot: true, + fsGroup: 2000, + }, }, }, serviceMonitor: From 88e11d48c238a781222f61892afb2105e3f6d7d1 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 7 Dec 2018 17:37:24 +0100 Subject: [PATCH 505/638] *: Regenerate after adding securityContext to manifests --- jsonnetfile.lock.json | 2 +- manifests/alertmanager-alertmanager.yaml | 4 ++++ manifests/prometheus-prometheus.yaml | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 0e75a6d8..e05e4e0a 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "433616b23b9c4bce759bc99c35ca2a66348c36b8" + "version": "cc1d3b421e00f8891582ba9692b78814220c69c6" }, { 
"name": "ksonnet", diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index 2230ea9e..c6f8ce05 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -10,5 +10,9 @@ spec: nodeSelector: beta.kubernetes.io/os: linux replicas: 3 + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 serviceAccountName: alertmanager-main version: v0.15.3 diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 94fd64dc..c16914b0 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -22,6 +22,10 @@ spec: matchLabels: prometheus: k8s role: alert-rules + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} From 12eaf2886659560c6d57d3919494f6e8715dc1f7 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 10 Dec 2018 18:24:52 +0100 Subject: [PATCH 506/638] contrib/kube-prometheus: Set Prometheus Adapter's maxSurge to 1 --- .../prometheus-adapter/prometheus-adapter.libsonnet | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet index ac675931..8a67e887 100644 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheusAdapter: 'v0.3.0', + prometheusAdapter: 'v0.4.0', }, imageRepos+:: { @@ -113,6 +113,8 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; deployment.mixin.metadata.withNamespace($._config.namespace) + deployment.mixin.spec.selector.withMatchLabels($._config.prometheusAdapter.labels) + 
deployment.mixin.spec.template.spec.withServiceAccountName($.prometheusAdapter.serviceAccount.metadata.name) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(1) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(0) + deployment.mixin.spec.template.spec.withVolumes([ volume.fromEmptyDir(name='tmpfs'), volume.fromEmptyDir(name='volume-serving-cert'), From a8b036e67d02977e07c212b8af03cbb745a8472d Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 10 Dec 2018 18:31:09 +0100 Subject: [PATCH 507/638] contrib/kube-prometheus: Generate Prometheus Adapter with maxSurge --- jsonnetfile.lock.json | 2 +- manifests/prometheus-adapter-deployment.yaml | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index e05e4e0a..d8455b56 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "cc1d3b421e00f8891582ba9692b78814220c69c6" + "version": "920b29babc4f4e490170b73aba2f9de86e0a08b6" }, { "name": "ksonnet", diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index 9d28503c..8772a7a5 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -8,6 +8,10 @@ spec: selector: matchLabels: name: prometheus-adapter + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 template: metadata: labels: @@ -21,7 +25,7 @@ spec: - --metrics-relist-interval=1m - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ - --secure-port=6443 - image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.3.0 + image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.0 name: prometheus-adapter ports: - containerPort: 6443 From b27b69e162c5cd2e9db24b7ab18973f92dc4d486 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 11 Dec 2018 15:34:02 +0100 Subject: [PATCH 508/638] kube-prometheus: Add 
poddisruptionbudgets to ksm cluster role This patch allows kube-state-metrics to list and watch instances of poddisruptionbudgets from the policy API group. --- .../kube-state-metrics.libsonnet | 97 ++++++++++--------- 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index a3bb0ec6..153d7b08 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -41,11 +41,11 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; clusterRole: local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; + local rulesType = clusterRole.rulesType; - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ + local coreRule = rulesType.new() + + rulesType.withApiGroups(['']) + + rulesType.withResources([ 'configmaps', 'secrets', 'nodes', @@ -59,57 +59,64 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; 'namespaces', 'endpoints', ]) + - policyRule.withVerbs(['list', 'watch']); + rulesType.withVerbs(['list', 'watch']); - local extensionsRule = policyRule.new() + - policyRule.withApiGroups(['extensions']) + - policyRule.withResources([ + local extensionsRule = rulesType.new() + + rulesType.withApiGroups(['extensions']) + + rulesType.withResources([ 'daemonsets', 'deployments', 'replicasets', ]) + - policyRule.withVerbs(['list', 'watch']); + rulesType.withVerbs(['list', 'watch']); - local appsRule = policyRule.new() + - policyRule.withApiGroups(['apps']) + - policyRule.withResources([ + local appsRule = rulesType.new() + + rulesType.withApiGroups(['apps']) + + rulesType.withResources([ 'statefulsets', 'daemonsets', 'deployments', 'replicasets', ]) + - policyRule.withVerbs(['list', 'watch']); + rulesType.withVerbs(['list', 'watch']); - local 
batchRule = policyRule.new() + - policyRule.withApiGroups(['batch']) + - policyRule.withResources([ + local batchRule = rulesType.new() + + rulesType.withApiGroups(['batch']) + + rulesType.withResources([ 'cronjobs', 'jobs', ]) + - policyRule.withVerbs(['list', 'watch']); + rulesType.withVerbs(['list', 'watch']); - local autoscalingRule = policyRule.new() + - policyRule.withApiGroups(['autoscaling']) + - policyRule.withResources([ + local autoscalingRule = rulesType.new() + + rulesType.withApiGroups(['autoscaling']) + + rulesType.withResources([ 'horizontalpodautoscalers', ]) + - policyRule.withVerbs(['list', 'watch']); + rulesType.withVerbs(['list', 'watch']); - local authenticationRole = policyRule.new() + - policyRule.withApiGroups(['authentication.k8s.io']) + - policyRule.withResources([ + local authenticationRole = rulesType.new() + + rulesType.withApiGroups(['authentication.k8s.io']) + + rulesType.withResources([ 'tokenreviews', ]) + - policyRule.withVerbs(['create']); + rulesType.withVerbs(['create']); - local authorizationRole = policyRule.new() + - policyRule.withApiGroups(['authorization.k8s.io']) + - policyRule.withResources([ + local authorizationRole = rulesType.new() + + rulesType.withApiGroups(['authorization.k8s.io']) + + rulesType.withResources([ 'subjectaccessreviews', ]) + - policyRule.withVerbs(['create']); + rulesType.withVerbs(['create']); - local rules = [coreRule, extensionsRule, appsRule, batchRule, autoscalingRule, authenticationRole, authorizationRole]; + local policyRule = rulesType.new() + + rulesType.withApiGroups(['policy']) + + rulesType.withResources([ + 'poddisruptionbudgets', + ]) + + rulesType.withVerbs(['list', 'watch']); + + local rules = [coreRule, extensionsRule, appsRule, batchRule, autoscalingRule, authenticationRole, authorizationRole, policyRule]; clusterRole.new() + clusterRole.mixin.metadata.withName('kube-state-metrics') + @@ -208,30 +215,30 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; role: local role = 
k.rbac.v1.role; - local policyRule = role.rulesType; + local rulesType = role.rulesType; - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ + local coreRule = rulesType.new() + + rulesType.withApiGroups(['']) + + rulesType.withResources([ 'pods', ]) + - policyRule.withVerbs(['get']); + rulesType.withVerbs(['get']); - local extensionsRule = policyRule.new() + - policyRule.withApiGroups(['extensions']) + - policyRule.withResources([ + local extensionsRule = rulesType.new() + + rulesType.withApiGroups(['extensions']) + + rulesType.withResources([ 'deployments', ]) + - policyRule.withVerbs(['get', 'update']) + - policyRule.withResourceNames(['kube-state-metrics']); + rulesType.withVerbs(['get', 'update']) + + rulesType.withResourceNames(['kube-state-metrics']); - local appsRule = policyRule.new() + - policyRule.withApiGroups(['apps']) + - policyRule.withResources([ + local appsRule = rulesType.new() + + rulesType.withApiGroups(['apps']) + + rulesType.withResources([ 'deployments', ]) + - policyRule.withVerbs(['get', 'update']) + - policyRule.withResourceNames(['kube-state-metrics']); + rulesType.withVerbs(['get', 'update']) + + rulesType.withResourceNames(['kube-state-metrics']); local rules = [coreRule, extensionsRule, appsRule]; From 9c467d233399f3a3924bc2fc1bccace3216a2f95 Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 11 Dec 2018 18:09:52 +0100 Subject: [PATCH 509/638] kube-prometheus: Update manifests --- jsonnetfile.lock.json | 2 +- manifests/kube-state-metrics-clusterRole.yaml | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index d8455b56..43cad589 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "920b29babc4f4e490170b73aba2f9de86e0a08b6" + "version": "9a1c77489e72ef80fcd29bd11d10d173db78b6c8" }, { "name": "ksonnet", diff 
--git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml index c519a918..b939df68 100644 --- a/manifests/kube-state-metrics-clusterRole.yaml +++ b/manifests/kube-state-metrics-clusterRole.yaml @@ -67,3 +67,10 @@ rules: - subjectaccessreviews verbs: - create +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch From 7e055479d6b22dda1ebdf8e687f7fc8fc7654a3e Mon Sep 17 00:00:00 2001 From: Robert Nemeti Date: Wed, 12 Dec 2018 16:17:17 +0000 Subject: [PATCH 510/638] add namespace to the component selectors in order to select the right components in case multiple operators are running in the same cluster --- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index f59cc80f..8f8bbf02 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -32,9 +32,9 @@ local configMapList = k.core.v1.configMapList; coreDNSSelector: 'job="kube-dns"', podLabel: 'pod', - alertmanagerSelector: 'job="alertmanager-main"', - prometheusSelector: 'job="prometheus-k8s"', - prometheusOperatorSelector: 'job="prometheus-operator"', + alertmanagerSelector: 'job="alertmanager-main",namespace="' + $._config.namespace + '"', + prometheusSelector: 'job="prometheus-' + $._config.prometheus.name + '",namespace="' + $._config.namespace + '"', + prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + $._config.namespace + '"', jobs: { Kubelet: $._config.kubeletSelector, From 63c67f87501614da487bb67c2ee252f1e62f5b12 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Fri, 14 Dec 2018 12:03:12 +0100 Subject: [PATCH 511/638] contrib/kube-prometheus: bump k8s-adapter to fix bug --- .../prometheus-adapter/prometheus-adapter.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet index 8a67e887..65e2e54c 100644 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheusAdapter: 'v0.4.0', + prometheusAdapter: 'v0.4.1', }, imageRepos+:: { From c82bae5b558572bc7b0107eeb95bfa399c4ed7a3 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Fri, 14 Dec 2018 12:08:06 +0100 Subject: [PATCH 512/638] contrib/kube-prometheus: regenerate --- jsonnetfile.lock.json | 2 +- manifests/prometheus-adapter-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 43cad589..c77cec21 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "9a1c77489e72ef80fcd29bd11d10d173db78b6c8" + "version": "360f5ad835e12f3627a45f2df0487f73ec8969e3" }, { "name": "ksonnet", diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index 8772a7a5..f575fc6c 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -25,7 +25,7 @@ spec: - --metrics-relist-interval=1m - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ - --secure-port=6443 - image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.0 + image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.1 name: prometheus-adapter ports: - containerPort: 6443 From e6c43d286de60ad4941e6a66dccde270720a19e7 Mon Sep 17 00:00:00 2001 From: Brian Christie Date: Tue, 18 Dec 2018 12:26:03 +0000 Subject: [PATCH 513/638] contrib/kube-prometheus: Add jobLabel to ServiceMonitorCoreDNS --- 
jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index c745f1c4..82c4fc90 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -391,6 +391,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, }, spec: { + jobLabel: 'k8s-app', selector: { matchLabels: { 'k8s-app': 'kube-dns', From 688629392a5cce18f201742299e8639dd95f5246 Mon Sep 17 00:00:00 2001 From: Lentil1016 Date: Mon, 7 Jan 2019 18:53:00 +0800 Subject: [PATCH 514/638] Update grafana revision to update prometheus datasource configuration. --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index c77cec21..87e21658 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "3df5e36ecbf348a13e155e12c495ac9fd05030b6" + "version": "3cab96409b2b4b8e8a87f768f1e2b063b1df7038" }, { "name": "prometheus-operator", From cb149a10bc0ac5209a8508e47b8d90d5063f4d6c Mon Sep 17 00:00:00 2001 From: BennX Date: Tue, 8 Jan 2019 15:57:40 +0100 Subject: [PATCH 515/638] Add SessionAffinity ClientIP to prometheus service. Adds SessionAffinity ClientIP to the prometheus service to prevent grafana from querying different instances when loading/updating metrics. 
--- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 82c4fc90..89d55bcc 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -38,6 +38,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local prometheusPort = servicePort.newNamed('web', 9090, 'web'); service.new('prometheus-' + $._config.prometheus.name, { app: 'prometheus', prometheus: $._config.prometheus.name }, prometheusPort) + + service.mixin.spec.withSessionAffinity('ClientIP') + service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ prometheus: $._config.prometheus.name }), [if $._config.prometheus.rules != null && $._config.prometheus.rules != {} then 'rules']: From 7261eee19cf41bf78d741f6054a4d35ba451f601 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 8 Jan 2019 16:16:09 +0100 Subject: [PATCH 516/638] kube-prometheus: Bump prometheus-operator --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index bae1e245..652d47a1 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.26.0" + "version": "v0.27.0" }, { "name": "etcd-mixin", From 31bb1a93538358b8241229bc75ebd29e61bf0dea Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 8 Jan 2019 16:18:57 +0100 Subject: [PATCH 517/638] *: re-generate kube-prometheus --- jsonnetfile.lock.json | 12 +- ...0alertmanagerCustomResourceDefinition.yaml | 6 + ...r-0prometheusCustomResourceDefinition.yaml | 27 + .../0prometheus-operator-deployment.yaml | 4 +- 
manifests/grafana-dashboardDefinitions.yaml | 557 +++++++++++++++++- manifests/grafana-deployment.yaml | 6 + manifests/prometheus-rules.yaml | 24 +- .../prometheus-serviceMonitorCoreDNS.yaml | 1 + 8 files changed, 617 insertions(+), 20 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 87e21658..d3dac64f 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "360f5ad835e12f3627a45f2df0487f73ec8969e3" + "version": "8582fcce481d4302a9c29e9b5e4182c5d98d67ed" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "64a06754786ca1a28929b9fb05c381085dcdd44c" + "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "eea8b5ba6b8883cf2df5a17c39a42c4b57c0d63e" + "version": "d05a9bd3adfba39ed3b1987a428d9e9863428df5" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "d60a39a5c01f651fdfef2db7a710bb5319b0dbc4" + "version": "c6932cf90bce4fef218b4308effc9f15c4219a01" }, { "name": "grafana", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "72ec4b9b16ef11700724dc71fec77112536eed40" + "version": "174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "dedae6eb7c253635e70403a1fb04842700277b23" + "version": "b04633fd8e67c65d4fe4929333fb4856f25da189" } ] } diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index d5c94fc9..89748f1a 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -1378,6 +1378,12 @@ spec: under. This is necessary to generate correct URLs. 
This is necessary if Alertmanager is not served from root of a DNS name. type: string + image: + description: Image if specified has precedence over baseImage, tag and + sha combinations. Specifying the version is still necessary to ensure + the Prometheus Operator knows what version of Alertmanager is being + configured. + type: string imagePullSecrets: description: An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index d825277e..627ce96d 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1550,6 +1550,12 @@ spec: under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name. type: string + image: + description: Image if specified has precedence over baseImage, tag and + sha combinations. Specifying the version is still necessary to ensure + the Prometheus Operator knows what version of Prometheus is being + configured. + type: string imagePullSecrets: description: An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries @@ -1863,6 +1869,21 @@ spec: priorityClassName: description: Priority class assigned to the Pods type: string + query: + description: QuerySpec defines the query command line flags when starting + Prometheus. + properties: + lookbackDelta: + description: The delta difference allowed for retrieving metrics + during expression evaluations. + type: string + maxConcurrency: + description: Number of concurrent queries that can be run at once. + format: int32 + type: integer + timeout: + description: Maximum time a query may take before being aborted. 
+ type: string remoteRead: description: If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way. @@ -2943,6 +2964,12 @@ spec: type: boolean required: - key + image: + description: Image if specified has precedence over baseImage, tag + and sha combinations. Specifying the version is still necessary + to ensure the Prometheus Operator knows what version of Thanos + is being configured. + type: string peers: description: Peers is a DNS name for Thanos to discover peers through. type: string diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 1ddbae2f..55ad9043 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -20,8 +20,8 @@ spec: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.26.0 - image: quay.io/coreos/prometheus-operator:v0.26.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.27.0 + image: quay.io/coreos/prometheus-operator:v0.27.0 name: prometheus-operator ports: - containerPort: 8080 diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e4364aa6..f2c1ba40 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": 
"sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -5064,7 +5064,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "avg by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m])) * 100", + "expr": "sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cpu}}", @@ -5775,7 +5775,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5866,7 +5866,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5921,6 +5921,198 @@ items: "title": "Dashboard Row", "titleSize": "h6", "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 12, + "legend": { + "alignAsTable": false, + 
"avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(\n node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inodes used", + "refId": "A" + }, + { + "expr": "max(node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inodes free", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Inodes Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 13, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + 
"name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(\n (\n (\n node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "Inodes Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" } ], "schemaVersion": 14, @@ -6012,6 +6204,349 @@ items: metadata: name: grafana-dashboard-nodes namespace: monitoring +- apiVersion: v1 + data: + persistentvolumesusage.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": 
true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} - kubelet_volume_stats_available_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"}) / kubelet_volume_stats_capacity_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ Usage }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Volume Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + 
"seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "kubelet_volume_stats_inodes_used{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} / kubelet_volume_stats_inodes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ Usage }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Volume inodes Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}, exported_namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + 
"includeAll": false, + "label": "PersistentVolumeClaim", + "multi": false, + "name": "volume", + "options": [ + + ], + "query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", exported_namespace=\"$namespace\"}, persistentvolumeclaim)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-7d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Persistent Volumes", + "uid": "919b92a8e8041bd567af9edab12c840c", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-persistentvolumesusage + namespace: monitoring - apiVersion: v1 data: pods.json: |- @@ -6078,21 +6613,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum 
by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -6418,7 +6953,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -6579,7 +7114,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -6659,7 +7194,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -6739,7 +7274,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m])) + 
sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 6816ce2f..aa5814eb 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -60,6 +60,9 @@ spec: - mountPath: /grafana-dashboard-definitions/0/nodes name: grafana-dashboard-nodes readOnly: false + - mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage + name: grafana-dashboard-persistentvolumesusage + readOnly: false - mountPath: /grafana-dashboard-definitions/0/pods name: grafana-dashboard-pods readOnly: false @@ -97,6 +100,9 @@ spec: - configMap: name: grafana-dashboard-nodes name: grafana-dashboard-nodes + - configMap: + name: grafana-dashboard-persistentvolumesusage + name: grafana-dashboard-persistentvolumesusage - configMap: name: grafana-dashboard-pods name: grafana-dashboard-pods diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 01ca8ddb..05e0debf 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -264,6 +264,28 @@ spec: node_namespace_pod:kube_pod_info: ) record: node:node_net_saturation:sum_irate + - expr: | + max( + max( + kube_pod_info{job="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_total:' + - expr: | + max( + max( + kube_pod_info{job="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_free:' - name: 
kube-prometheus-node-recording.rules rules: - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) @@ -680,7 +702,7 @@ spec: }}' is experiencing {{ printf "%0.0f" $value }}% errors.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | - (sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job) + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) / sum(rate(rest_client_requests_total[5m])) by (instance, job)) * 100 > 1 diff --git a/manifests/prometheus-serviceMonitorCoreDNS.yaml b/manifests/prometheus-serviceMonitorCoreDNS.yaml index 14a24545..633aa18c 100644 --- a/manifests/prometheus-serviceMonitorCoreDNS.yaml +++ b/manifests/prometheus-serviceMonitorCoreDNS.yaml @@ -10,6 +10,7 @@ spec: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token interval: 15s port: metrics + jobLabel: k8s-app namespaceSelector: matchNames: - kube-system From f7bb04a715e8948501e3d4f5e1d22a8704f3b17b Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 8 Jan 2019 16:26:43 +0100 Subject: [PATCH 518/638] *: re-generate --- manifests/grafana-dashboardDefinitions.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index f2c1ba40..4900caae 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": 
"sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -5775,7 +5775,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5866,7 +5866,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6613,21 +6613,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + 
"expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -6953,7 +6953,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -7114,7 +7114,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7194,7 +7194,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7274,7 +7274,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m])) + 
sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=~\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", From 3ec88e526b4c9f28ab9a92051bced096d43541fc Mon Sep 17 00:00:00 2001 From: Cyril Jouve Date: Tue, 8 Jan 2019 19:48:24 +0100 Subject: [PATCH 519/638] add service monitor to grafana --- .../kube-prometheus/kube-prometheus.libsonnet | 25 +++++++++++++++++++ manifests/grafana-service.yaml | 2 ++ manifests/grafana-serviceMonitor.yaml | 12 +++++++++ 3 files changed, 39 insertions(+) create mode 100644 manifests/grafana-serviceMonitor.yaml diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index f59cc80f..b751ce71 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -15,7 +15,32 @@ local configMapList = k.core.v1.configMapList; namespace: k.core.v1.namespace.new($._config.namespace), }, grafana+:: { + local serviceLabels = { + app: 'grafana', + }, dashboardDefinitions: configMapList.new(super.dashboardDefinitions), + service+: { + labels+: serviceLabels, + }, + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'grafana', + namespace: $._config.namespace, + }, + spec: { + selector: { + matchLabels: serviceLabels, + }, + endpoints: [ + { + port: 'http', + interval: '15s', + }, + ], + }, + }, }, } + { _config+:: { diff --git a/manifests/grafana-service.yaml b/manifests/grafana-service.yaml index 45f77a0d..09ec1f97 100644 --- a/manifests/grafana-service.yaml +++ b/manifests/grafana-service.yaml @@ -1,5 +1,7 @@ apiVersion: v1 kind: Service +labels: + app: 
grafana metadata: name: grafana namespace: monitoring diff --git a/manifests/grafana-serviceMonitor.yaml b/manifests/grafana-serviceMonitor.yaml new file mode 100644 index 00000000..7ede266a --- /dev/null +++ b/manifests/grafana-serviceMonitor.yaml @@ -0,0 +1,12 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: grafana + namespace: monitoring +spec: + endpoints: + - interval: 15s + port: http + selector: + matchLabels: + app: grafana From 7de14cd802d6dd041d7a665501c271dbe8fa0c0b Mon Sep 17 00:00:00 2001 From: Cyril Jouve Date: Tue, 8 Jan 2019 19:49:10 +0100 Subject: [PATCH 520/638] bump kube-prometheus --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index d3dac64f..b7150d97 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "8582fcce481d4302a9c29e9b5e4182c5d98d67ed" + "version": "d3b2645d2ace03b36ed7d86e5213c09c9e1bde67" }, { "name": "ksonnet", From c18d8a4d6ead19edb1b7c4017ec7fd30936a865a Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 9 Jan 2019 10:00:18 +0100 Subject: [PATCH 521/638] kube-prometheus: Fix grafana label --- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index b751ce71..4ff18371 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -15,13 +15,7 @@ local configMapList = k.core.v1.configMapList; namespace: k.core.v1.namespace.new($._config.namespace), }, grafana+:: { - local serviceLabels = { - app: 'grafana', - }, dashboardDefinitions: configMapList.new(super.dashboardDefinitions), - service+: { - labels+: serviceLabels, - }, serviceMonitor: { apiVersion: 
'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -31,7 +25,9 @@ local configMapList = k.core.v1.configMapList; }, spec: { selector: { - matchLabels: serviceLabels, + matchLabels: { + app: 'grafana', + }, }, endpoints: [ { From 90a2e58aa4542c226dab6b726c7f84e898199102 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 9 Jan 2019 11:34:06 +0100 Subject: [PATCH 522/638] contrib/kube-prometheus: Add linux node selector to adapter deployment --- .../prometheus-adapter/prometheus-adapter.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet index 65e2e54c..8624c94a 100644 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -113,6 +113,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; deployment.mixin.metadata.withNamespace($._config.namespace) + deployment.mixin.spec.selector.withMatchLabels($._config.prometheusAdapter.labels) + deployment.mixin.spec.template.spec.withServiceAccountName($.prometheusAdapter.serviceAccount.metadata.name) + + deployment.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(1) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(0) + deployment.mixin.spec.template.spec.withVolumes([ From af8ac93d527995267383318c284b1e332ef675b6 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 9 Jan 2019 11:52:16 +0100 Subject: [PATCH 523/638] contrib/kube-prometheus: regenerate --- jsonnetfile.lock.json | 8 ++++---- manifests/grafana-deployment.yaml | 2 ++ manifests/grafana-service.yaml | 4 ++-- manifests/prometheus-adapter-deployment.yaml | 2 ++ manifests/prometheus-service.yaml | 1 + 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/jsonnetfile.lock.json 
b/jsonnetfile.lock.json index b7150d97..aa834a29 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "d3b2645d2ace03b36ed7d86e5213c09c9e1bde67" + "version": "e26f80aca8c9b021245ed3a62bb6c4d23be25786" }, { "name": "ksonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "c6932cf90bce4fef218b4308effc9f15c4219a01" + "version": "d75c3b260c1077c924d7ea0240250afc235c4cb3" }, { "name": "grafana", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "3cab96409b2b4b8e8a87f768f1e2b063b1df7038" + "version": "9ddf5a198b0f7c898dc061158ea427112acbae11" }, { "name": "prometheus-operator", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "b04633fd8e67c65d4fe4929333fb4856f25da189" + "version": "1e42503bea073b559fca682219242a801cf4d587" } ] } diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index aa5814eb..60ffc3b4 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -69,6 +69,8 @@ spec: - mountPath: /grafana-dashboard-definitions/0/statefulset name: grafana-dashboard-statefulset readOnly: false + nodeSelector: + beta.kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manifests/grafana-service.yaml b/manifests/grafana-service.yaml index 09ec1f97..3acdf1e8 100644 --- a/manifests/grafana-service.yaml +++ b/manifests/grafana-service.yaml @@ -1,8 +1,8 @@ apiVersion: v1 kind: Service -labels: - app: grafana metadata: + labels: + app: grafana name: grafana namespace: monitoring spec: diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index f575fc6c..b0c20198 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -39,6 +39,8 @@ spec: - mountPath: /etc/adapter name: config readOnly: false + nodeSelector: + 
beta.kubernetes.io/os: linux serviceAccountName: prometheus-adapter volumes: - emptyDir: {} diff --git a/manifests/prometheus-service.yaml b/manifests/prometheus-service.yaml index 85b007f8..4f61e88a 100644 --- a/manifests/prometheus-service.yaml +++ b/manifests/prometheus-service.yaml @@ -13,3 +13,4 @@ spec: selector: app: prometheus prometheus: k8s + sessionAffinity: ClientIP From 1a9f1ddeb3886b92599eae515af702d46ddb81d9 Mon Sep 17 00:00:00 2001 From: Brian Christie Date: Tue, 18 Dec 2018 17:13:43 +0000 Subject: [PATCH 524/638] contrib/kube-prometheus: Add kops CoreDNS prometheus discovery service --- README.md | 11 +++++++++++ examples/jsonnet-snippets/kops-coredns.jsonnet | 3 +++ .../kube-prometheus-kops-coredns.libsonnet | 13 +++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 examples/jsonnet-snippets/kops-coredns.jsonnet create mode 100644 jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet diff --git a/README.md b/README.md index b8f69a8b..5901d192 100644 --- a/README.md +++ b/README.md @@ -359,6 +359,17 @@ kops: (import 'kube-prometheus/kube-prometheus-kops.libsonnet') ``` +kops with CoreDNS: + +If your kops cluster is using CoreDNS, there is an additional mixin to import. 
+ +[embedmd]:# (examples/jsonnet-snippets/kops-coredns.jsonnet) +```jsonnet +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') +``` + kubespray: [embedmd]:# (examples/jsonnet-snippets/kubespray.jsonnet) diff --git a/examples/jsonnet-snippets/kops-coredns.jsonnet b/examples/jsonnet-snippets/kops-coredns.jsonnet new file mode 100644 index 00000000..4988ef0c --- /dev/null +++ b/examples/jsonnet-snippets/kops-coredns.jsonnet @@ -0,0 +1,3 @@ +(import 'kube-prometheus/kube-prometheus.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') diff --git a/jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet new file mode 100644 index 00000000..4c610872 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet @@ -0,0 +1,13 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+:: { + kubeDnsPrometheusDiscoveryService: + service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('metrics', 9153, 9153)]) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) + + service.mixin.spec.withClusterIp('None'), + }, +} From 9173d17e7adf39f66159ddcb6d3e9a1e02e2f3a5 Mon Sep 17 00:00:00 2001 From: Brian Christie Date: Mon, 7 Jan 2019 17:15:54 +0000 Subject: [PATCH 525/638] Bump kube-prometheus version in jsonnetfile.lock.json --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index aa834a29..35dd0100 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 
+8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "e26f80aca8c9b021245ed3a62bb6c4d23be25786" + "version": "ef7b7f4941e117223cbef3e278b9161736a302a5" }, { "name": "ksonnet", @@ -81,4 +81,4 @@ "version": "1e42503bea073b559fca682219242a801cf4d587" } ] -} +} \ No newline at end of file From 1fe933bb66b8e401044d873bd9623f672560e3c9 Mon Sep 17 00:00:00 2001 From: Brian Christie Date: Wed, 9 Jan 2019 12:25:05 +0000 Subject: [PATCH 526/638] Fix jsonnetfile.lock.json --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 35dd0100..71ba5413 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "ef7b7f4941e117223cbef3e278b9161736a302a5" + "version": "0d011e48ab48a9e624347d82b7f8ecdcfe6bb5dd" }, { "name": "ksonnet", @@ -81,4 +81,4 @@ "version": "1e42503bea073b559fca682219242a801cf4d587" } ] -} \ No newline at end of file +} From ea4393e655666533fdcd362f5455a27e247e5013 Mon Sep 17 00:00:00 2001 From: Brian Christie Date: Wed, 9 Jan 2019 13:10:13 +0000 Subject: [PATCH 527/638] Bump jsonnetfile.lock.json again --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 71ba5413..597a93f0 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "0d011e48ab48a9e624347d82b7f8ecdcfe6bb5dd" + "version": "acd31e80dcddd7ce11fd3b715d5bfe179a95474e" }, { "name": "ksonnet", @@ -81,4 +81,4 @@ "version": "1e42503bea073b559fca682219242a801cf4d587" } ] -} +} \ No newline at end of file From 6bea0f66b59b31cd73cce33cca8a3d4c3214f3d8 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 9 Jan 2019 15:06:53 +0100 Subject: [PATCH 528/638] contrib/kube-prometheus: Update 
kube-prometheus-thanos for thanos v0.2 --- .../kube-prometheus/kube-prometheus-thanos.libsonnet | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index 28b76dff..d809e493 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -2,15 +2,20 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local service = k.core.v1.service; local servicePort = k.core.v1.service.mixin.spec.portsType; - { _config+:: { versions+:: { - thanos: 'v0.1.0', + thanos: 'v0.2.1', }, imageRepos+:: { thanos: 'improbable/thanos', }, + thanos+:: { + objectStorageConfig: { + key: 'thanos.yaml', # How the file inside the secret is called + name: 'thanos-objstore-config', # This is the name of your Kubernetes secret with the config + }, + }, }, prometheus+:: { prometheus+: { @@ -22,6 +27,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; peers: 'thanos-peers.' 
+ $._config.namespace + '.svc:10900', version: $._config.versions.thanos, baseImage: $._config.imageRepos.thanos, + objectStorageConfig: $._config.thanos.objectStorageConfig, }, }, }, From 73bd58f8d8d3bf59b1fa6aead6daccfcc7f12a51 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 9 Jan 2019 16:01:00 +0000 Subject: [PATCH 529/638] Remove accidental whitespace jsonnet-snippets/kops-coredns.jsonnet Co-Authored-By: BrianChristie --- examples/jsonnet-snippets/kops-coredns.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/jsonnet-snippets/kops-coredns.jsonnet b/examples/jsonnet-snippets/kops-coredns.jsonnet index 4988ef0c..6e308579 100644 --- a/examples/jsonnet-snippets/kops-coredns.jsonnet +++ b/examples/jsonnet-snippets/kops-coredns.jsonnet @@ -1,3 +1,3 @@ (import 'kube-prometheus/kube-prometheus.libsonnet') + (import 'kube-prometheus/kube-prometheus-kops.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') +(import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') From bc810b2ab36271a72cbc0cbbc546845397e69e54 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 10 Jan 2019 15:32:51 +0100 Subject: [PATCH 530/638] contrib/kube-prometheus: Move thanosPeerService up for better ordering --- .../kube-prometheus/kube-prometheus-thanos.libsonnet | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index d809e493..3dd55df4 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -31,6 +31,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; }, }, }, + thanosPeerService: + local thanosPeerPort = servicePort.newNamed('cluster', 10900, 'cluster'); + service.new('thanos-peers', { 'thanos-peer': 'true' }, thanosPeerPort) + + 
service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.spec.withType('ClusterIP') + + service.mixin.spec.withClusterIp('None'), thanosQueryDeployment: local deployment = k.apps.v1beta2.deployment; local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; @@ -60,11 +66,5 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; service.new('thanos-query', { app: 'thanos-query' }, thanosQueryPort) + service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ app: 'thanos-query' }), - thanosPeerService: - local thanosPeerPort = servicePort.newNamed('cluster', 10900, 'cluster'); - service.new('thanos-peers', { 'thanos-peer': 'true' }, thanosPeerPort) + - service.mixin.metadata.withNamespace($._config.namespace) + - service.mixin.spec.withType('ClusterIP') + - service.mixin.spec.withClusterIp('None'), }, } From abd6ee5203b6da4f9f058ce01c89bbc5c41f02fb Mon Sep 17 00:00:00 2001 From: Jono MacDougall Date: Thu, 10 Jan 2019 14:54:35 +0000 Subject: [PATCH 531/638] Added ability to configure nodeExporter port --- README.md | 4 ++++ .../node-exporter/node-exporter.libsonnet | 14 +++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5901d192..bbc0d0a7 100644 --- a/README.md +++ b/README.md @@ -312,6 +312,10 @@ These are the available fields with their respective default values: cpuPerNode: '2m', memoryPerNode: '30Mi', }, + + nodeExporter+:: { + port: 9100, + }, }, } ``` diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index c9e1faeb..d7f77bf9 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -13,6 +13,10 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; nodeExporter: 'quay.io/prometheus/node-exporter', kubeRbacProxy: 
'quay.io/coreos/kube-rbac-proxy', }, + + nodeExporter+:: { + port: 9100, + }, }, nodeExporter+:: { @@ -83,7 +87,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local nodeExporter = container.new('node-exporter', $._config.imageRepos.nodeExporter + ':' + $._config.versions.nodeExporter) + container.withArgs([ - '--web.listen-address=127.0.0.1:9100', + '--web.listen-address=127.0.0.1:' + $._config.nodeExporter.port, '--path.procfs=/host/proc', '--path.sysfs=/host/sys', @@ -101,8 +105,8 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local proxy = container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ - '--secure-listen-address=$(IP):9100', - '--upstream=http://127.0.0.1:9100/', + '--secure-listen-address=$(IP):' + $._config.nodeExporter.port, + '--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/', ]) + // Keep `hostPort` here, rather than in the node-exporter container // because Kubernetes mandates that if you define a `hostPort` then @@ -112,7 +116,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; // used by the service is tied to the proxy container. We *could* // forgo declaring the host port, however it is important to declare // it so that the scheduler can decide if the pod is schedulable. 
- container.withPorts(containerPort.new(9100) + containerPort.withHostPort(9100) + containerPort.withName('https')) + + container.withPorts(containerPort.new($._config.nodeExporter.port) + containerPort.withHostPort($._config.nodeExporter.port) + containerPort.withName('https')) + container.mixin.resources.withRequests({ cpu: '10m', memory: '20Mi' }) + container.mixin.resources.withLimits({ cpu: '20m', memory: '40Mi' }) + container.withEnv([ip]); @@ -177,7 +181,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local service = k.core.v1.service; local servicePort = k.core.v1.service.mixin.spec.portsType; - local nodeExporterPort = servicePort.newNamed('https', 9100, 'https'); + local nodeExporterPort = servicePort.newNamed('https', $._config.nodeExporter.port, 'https'); service.new('node-exporter', $.nodeExporter.daemonset.spec.selector.matchLabels, nodeExporterPort) + service.mixin.metadata.withNamespace($._config.namespace) + From 07813fdc5b3e941f70e25cddd985ddd7a913b52a Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 10 Jan 2019 16:04:50 +0100 Subject: [PATCH 532/638] contrib/kube-prometheus: Add Thanos store StatefulSet --- .../kube-prometheus-thanos.libsonnet | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index 3dd55df4..e37df7b7 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -12,8 +12,8 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; }, thanos+:: { objectStorageConfig: { - key: 'thanos.yaml', # How the file inside the secret is called - name: 'thanos-objstore-config', # This is the name of your Kubernetes secret with the config + key: 'thanos.yaml', // How the file inside the secret is called + name: 'thanos-objstore-config', // This is the name of your Kubernetes secret with the config 
}, }, }, @@ -66,5 +66,47 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; service.new('thanos-query', { app: 'thanos-query' }, thanosQueryPort) + service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ app: 'thanos-query' }), + + thanosStoreStatefulset: + local statefulSet = k.apps.v1beta2.statefulSet; + local volume = statefulSet.mixin.spec.template.spec.volumesType; + local container = statefulSet.mixin.spec.template.spec.containersType; + local containerEnv = container.envType; + local containerVolumeMount = container.volumeMountsType; + + local labels = { app: 'thanos', 'thanos-peer': 'true' }; + + local c = + container.new('thanos-store', $._config.imageRepos.thanos + ':' + $._config.versions.thanos) + + container.withArgs([ + 'store', + '--log.level=debug', + '--data-dir=/var/thanos/store', + '--cluster.peers=thanos-peers.' + $._config.namespace + '.svc:10900', + '--objstore.config=$(OBJSTORE_CONFIG)', + ]) + + container.withEnv([ + containerEnv.fromSecretRef( + 'OBJSTORE_CONFIG', + $._config.thanos.objectStorageConfig.name, + $._config.thanos.objectStorageConfig.key, + ), + ]) + + container.withPorts([ + { name: 'cluster', containerPort: 10900 }, + { name: 'grpc', containerPort: 10901 }, + { name: 'http', containerPort: 10902 }, + ]) + + container.withVolumeMounts([ + containerVolumeMount.new('data', '/var/thanos/store', false), + ]); + + statefulSet.new('thanos-store', 1, c, [], labels) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.spec.selector.withMatchLabels(labels) + + statefulSet.mixin.spec.withServiceName('thanos-store') + + statefulSet.mixin.spec.template.spec.withVolumes([ + volume.fromEmptyDir('data'), + ]), }, } From 1d4a8b9f43dfd2dab4a577b421fefaedc1325950 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 11 Jan 2019 11:25:17 +0100 Subject: [PATCH 533/638] contrib/kube-prometheus: Bump kube-state-metrics to v1.5.0 --- 
.../kube-state-metrics/kube-state-metrics.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 153d7b08..21600af1 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -16,7 +16,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, versions+:: { - kubeStateMetrics: 'v1.4.0', + kubeStateMetrics: 'v1.5.0', kubeRbacProxy: 'v0.4.0', addonResizer: '1.0', }, From dce8ce54060f6b26ceb7dae3d25bc58c1fd0e9b3 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 11 Jan 2019 11:28:17 +0100 Subject: [PATCH 534/638] contrib/kube-prometheus: Generate with kube-state-metrics v1.5.0 --- jsonnetfile.lock.json | 2 +- manifests/kube-state-metrics-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 597a93f0..00b8c09d 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "acd31e80dcddd7ce11fd3b715d5bfe179a95474e" + "version": "b05cbfcdb97236424807dfc75a92d7a56aa25ff5" }, { "name": "ksonnet", diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index 87aa40af..fac02f07 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -51,7 +51,7 @@ spec: - --port=8081 - --telemetry-host=127.0.0.1 - --telemetry-port=8082 - image: quay.io/coreos/kube-state-metrics:v1.4.0 + image: quay.io/coreos/kube-state-metrics:v1.5.0 name: kube-state-metrics resources: limits: From 0b55989aef8aaa5b4fb7d5536cabfa5a7c925b41 Mon Sep 17 00:00:00 2001 From: liuxu Date: Fri, 11 Jan 2019 15:16:46 +0800 Subject: [PATCH 535/638] fix 
node_filesystem only collect mountpoint '/' metrics --- jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet | 1 + manifests/node-exporter-daemonset.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index d7f77bf9..69025a59 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -90,6 +90,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--web.listen-address=127.0.0.1:' + $._config.nodeExporter.port, '--path.procfs=/host/proc', '--path.sysfs=/host/sys', + '--path.rootfs=/host/root', // The following settings have been taken from // https://github.com/prometheus/node_exporter/blob/0662673/collector/filesystem_linux.go#L30-L31 diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 28d424b8..6717bc90 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -19,6 +19,7 @@ spec: - --web.listen-address=127.0.0.1:9100 - --path.procfs=/host/proc - --path.sysfs=/host/sys + - --path.rootfs=/host/root - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ image: quay.io/prometheus/node-exporter:v0.16.0 From 7934b9babfa5e22b9fa789ab6b835925c36f0ab9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 14 Jan 2019 13:14:38 +0100 Subject: [PATCH 536/638] kube-prometheus: bump dependencies --- jsonnetfile.lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 00b8c09d..cc9d2f4e 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 
@@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "b05cbfcdb97236424807dfc75a92d7a56aa25ff5" + "version": "e8438503ba627ac08646c6d0a9ac31aa0efa8506" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "1e42503bea073b559fca682219242a801cf4d587" + "version": "1eee465a43720d713bb69f7b7f5e120135fdb1ac" } ] -} \ No newline at end of file +} From 5cf1053f136002d0a8e8252a42ca8b9cb1eb96eb Mon Sep 17 00:00:00 2001 From: Brian Christie Date: Mon, 14 Jan 2019 16:43:45 +0000 Subject: [PATCH 537/638] Fix node exporter crashlooping --- jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 69025a59..ee0c3e99 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - nodeExporter: 'v0.16.0', + nodeExporter: 'v0.17.0', kubeRbacProxy: 'v0.4.0', }, From 38a200eaa937fc90b867615c577ea72cbe106b90 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 14 Jan 2019 18:19:15 +0100 Subject: [PATCH 538/638] contrib/kube-prometheus: Add ServiceMonitor for thanos-peers (prom, store, query) --- .../kube-prometheus-thanos.libsonnet | 40 ++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index e37df7b7..71d342dc 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -21,7 +21,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; prometheus+: { spec+: { podMetadata+: { - labels+: { 'thanos-peer': 
'true' }, + labels+: { 'thanos-peers': 'true' }, }, thanos+: { peers: 'thanos-peers.' + $._config.namespace + '.svc:10900', @@ -32,11 +32,41 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; }, }, thanosPeerService: - local thanosPeerPort = servicePort.newNamed('cluster', 10900, 'cluster'); - service.new('thanos-peers', { 'thanos-peer': 'true' }, thanosPeerPort) + + service.new('thanos-peers', { 'thanos-peers': 'true' }, [ + servicePort.newNamed('cluster', 10900, 'cluster'), + servicePort.newNamed('http', 10902, 'http'), + ]) + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ 'thanos-peers': 'true' }) + service.mixin.spec.withType('ClusterIP') + service.mixin.spec.withClusterIp('None'), + + serviceMonitorThanosPeer: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'thanos-peers', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'thanos-peers', + }, + }, + spec: { + jobLabel: 'k8s-app', + endpoints: [ + { + port: 'http', + interval: '30s', + }, + ], + selector: { + matchLabels: { + 'thanos-peers': 'true', + }, + }, + }, + }, thanosQueryDeployment: local deployment = k.apps.v1beta2.deployment; local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; @@ -55,7 +85,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; '--query.replica-label=prometheus_replica', '--cluster.peers=thanos-peers.' 
+ $._config.namespace + '.svc:10900', ]); - local podLabels = { app: 'thanos-query', 'thanos-peer': 'true' }; + local podLabels = { app: 'thanos-query', 'thanos-peers': 'true' }; deployment.new('thanos-query', 1, thanosQueryContainer, podLabels) + deployment.mixin.metadata.withNamespace($._config.namespace) + deployment.mixin.metadata.withLabels(podLabels) + @@ -74,7 +104,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; local containerEnv = container.envType; local containerVolumeMount = container.volumeMountsType; - local labels = { app: 'thanos', 'thanos-peer': 'true' }; + local labels = { app: 'thanos', 'thanos-peers': 'true' }; local c = container.new('thanos-store', $._config.imageRepos.thanos + ':' + $._config.versions.thanos) + From 0423a7ff81b0503b581cef38428f5e17d7951ae0 Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak Date: Wed, 16 Jan 2019 15:39:27 +0100 Subject: [PATCH 539/638] kube-prometheus: regenerate --- jsonnetfile.lock.json | 2 +- manifests/node-exporter-daemonset.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index cc9d2f4e..ba4cb47e 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "e8438503ba627ac08646c6d0a9ac31aa0efa8506" + "version": "0714fe498018819fa790fc040319210d00e46e7b" }, { "name": "ksonnet", diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 6717bc90..4dea336b 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -22,7 +22,7 @@ spec: - --path.rootfs=/host/root - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ - image: 
quay.io/prometheus/node-exporter:v0.16.0 + image: quay.io/prometheus/node-exporter:v0.17.0 name: node-exporter resources: limits: From 6252f6e2afe2bdd69b333dd914b2f22170f46938 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 17 Jan 2019 14:13:24 +0100 Subject: [PATCH 540/638] kube-prometheus: Disable known insecure TLS cipher suites --- .../kube-prometheus/kube-prometheus.libsonnet | 25 +++++++++++++ .../kube-state-metrics.libsonnet | 37 ++++++++++++++++--- .../node-exporter/node-exporter.libsonnet | 1 + 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 4ff18371..7dae5f38 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -42,6 +42,31 @@ local configMapList = k.core.v1.configMapList; _config+:: { namespace: 'default', + tlsCipherSuites: [ + // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 + 'TLS_RSA_WITH_AES_128_CBC_SHA', + 'TLS_RSA_WITH_AES_256_CBC_SHA', + 'TLS_RSA_WITH_AES_128_CBC_SHA256', + 'TLS_RSA_WITH_AES_128_GCM_SHA256', + 'TLS_RSA_WITH_AES_256_GCM_SHA384', + // 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', + 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', + // 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 + 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', + 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', + 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', + 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', + 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', + 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', + 
'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', + 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', + ], + cadvisorSelector: 'job="kubelet"', kubeletSelector: 'job="kubelet"', kubeStateMetricsSelector: 'job="kube-state-metrics"', diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 21600af1..4a9ee58f 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -4,6 +4,31 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; _config+:: { namespace: 'default', + tlsCipherSuites: [ + // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 + 'TLS_RSA_WITH_AES_128_CBC_SHA', + 'TLS_RSA_WITH_AES_256_CBC_SHA', + 'TLS_RSA_WITH_AES_128_CBC_SHA256', + 'TLS_RSA_WITH_AES_128_GCM_SHA256', + 'TLS_RSA_WITH_AES_256_GCM_SHA384', + // 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', + 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', + // 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 + 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', + 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', + 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', + 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', + 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', + 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', + 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', + 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', + ], + 
kubeStateMetrics+:: { collectors: '', // empty string gets a default set scrapeInterval: '30s', @@ -110,11 +135,11 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; rulesType.withVerbs(['create']); local policyRule = rulesType.new() + - rulesType.withApiGroups(['policy']) + - rulesType.withResources([ - 'poddisruptionbudgets', - ]) + - rulesType.withVerbs(['list', 'watch']); + rulesType.withApiGroups(['policy']) + + rulesType.withResources([ + 'poddisruptionbudgets', + ]) + + rulesType.withVerbs(['list', 'watch']); local rules = [coreRule, extensionsRule, appsRule, batchRule, autoscalingRule, authenticationRole, authorizationRole, policyRule]; @@ -135,6 +160,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; container.new('kube-rbac-proxy-main', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ '--secure-listen-address=:8443', + '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites), '--upstream=http://127.0.0.1:8081/', ]) + container.withPorts(containerPort.newNamed('https-main', 8443)) + @@ -145,6 +171,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; container.new('kube-rbac-proxy-self', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ '--secure-listen-address=:9443', + '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites), '--upstream=http://127.0.0.1:8082/', ]) + container.withPorts(containerPort.newNamed('https-self', 9443)) + diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index ee0c3e99..7030d2f9 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -107,6 +107,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + 
container.withArgs([ '--secure-listen-address=$(IP):' + $._config.nodeExporter.port, + '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites), '--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/', ]) + // Keep `hostPort` here, rather than in the node-exporter container From 27b5a0d4e72315b56a22d2a84a3ce6a307a1627c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 17 Jan 2019 15:27:25 +0100 Subject: [PATCH 541/638] kube-prometheus: Bump kube-prometheus jsonnet dependency --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index ba4cb47e..daec417f 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "0714fe498018819fa790fc040319210d00e46e7b" + "version": "3e7bb3da22597e0a2b7ed0a7b6e0f6a82796eb5c" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "1eee465a43720d713bb69f7b7f5e120135fdb1ac" + "version": "a00bff7848db1dcead692e2bea1d7c87e8a2c157" } ] } From 63746f59333aab8bf86bafc9ae40458165af5863 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 17 Jan 2019 15:36:43 +0100 Subject: [PATCH 542/638] kube-prometheus: re-generate --- manifests/kube-state-metrics-deployment.yaml | 2 ++ manifests/node-exporter-daemonset.yaml | 1 + 2 files changed, 3 insertions(+) diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index fac02f07..1e8fc1e2 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -18,6 +18,7 @@ spec: containers: - args: - --secure-listen-address=:8443 + - 
--tls-cipher-suites=TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8081/ image: quay.io/coreos/kube-rbac-proxy:v0.4.0 name: kube-rbac-proxy-main @@ -33,6 +34,7 @@ spec: memory: 20Mi - args: - --secure-listen-address=:9443 + - --tls-cipher-suites=TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8082/ image: quay.io/coreos/kube-rbac-proxy:v0.4.0 name: kube-rbac-proxy-self diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 4dea336b..c3de3679 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -44,6 +44,7 @@ spec: readOnly: true - args: - --secure-listen-address=$(IP):9100 + - 
--tls-cipher-suites=TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:9100/ env: - name: IP From 853b3cf6327cb63e912cf53e912bb217f20ead5c Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Tue, 15 Jan 2019 18:54:33 +0100 Subject: [PATCH 543/638] contrib/kube-prometheus: Add Thanos compactor as statefulset --- .../kube-prometheus-thanos.libsonnet | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index 71d342dc..53c2e614 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -138,5 +138,44 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; statefulSet.mixin.spec.template.spec.withVolumes([ volume.fromEmptyDir('data'), ]), + + thanosCompactorStatefulset: + local statefulSet = k.apps.v1beta2.statefulSet; + local volume = statefulSet.mixin.spec.template.spec.volumesType; + local container = statefulSet.mixin.spec.template.spec.containersType; + local containerEnv = container.envType; + local containerVolumeMount = container.volumeMountsType; + + local labels = { app: 'thanos', 'thanos-peers': 'true' }; + + local c = + container.new('thanos-compactor', $._config.imageRepos.thanos + ':' + $._config.versions.thanos) + + container.withArgs([ + 'compact', + 
'--log.level=debug', + '--data-dir=/var/thanos/store', + '--objstore.config=$(OBJSTORE_CONFIG)', + ]) + + container.withEnv([ + containerEnv.fromSecretRef( + 'OBJSTORE_CONFIG', + $._config.thanos.objectStorageConfig.name, + $._config.thanos.objectStorageConfig.key, + ), + ]) + + container.withPorts([ + { name: 'http', containerPort: 10902 }, + ]) + + container.withVolumeMounts([ + containerVolumeMount.new('data', '/var/thanos/store', false), + ]); + + statefulSet.new('thanos-compactor', 1, c, [], labels) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.spec.selector.withMatchLabels(labels) + + statefulSet.mixin.spec.withServiceName('thanos-compactor') + + statefulSet.mixin.spec.template.spec.withVolumes([ + volume.fromEmptyDir('data'), + ]), }, } From e10cafcc25bdcdd99acedf38e52b7fe34a232883 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 16 Jan 2019 16:56:36 +0100 Subject: [PATCH 544/638] contrib/kube-prometheus: Create Service and ServiceMonitor for Thanos compactor --- .../kube-prometheus-thanos.libsonnet | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index 53c2e614..cc6ec460 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -139,6 +139,42 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; volume.fromEmptyDir('data'), ]), + serviceMonitorThanosCompactor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'thanos-compactor', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'thanos-compactor', + }, + }, + spec: { + jobLabel: 'k8s-app', + endpoints: [ + { + port: 'http', + interval: '30s', + }, + ], + selector: { + matchLabels: { + app: 'thanos-compactor', + }, + }, + }, + }, + + thanosCompactorService: + service.new( 
+ 'thanos-compactor', + { app: 'thanos-compactor' }, + servicePort.newNamed('http', 9090, 'http'), + ) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ app: 'thanos-compactor' }), + thanosCompactorStatefulset: local statefulSet = k.apps.v1beta2.statefulSet; local volume = statefulSet.mixin.spec.template.spec.volumesType; @@ -146,7 +182,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; local containerEnv = container.envType; local containerVolumeMount = container.volumeMountsType; - local labels = { app: 'thanos', 'thanos-peers': 'true' }; + local labels = { app: 'thanos-compactor' }; local c = container.new('thanos-compactor', $._config.imageRepos.thanos + ':' + $._config.versions.thanos) + @@ -155,6 +191,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; '--log.level=debug', '--data-dir=/var/thanos/store', '--objstore.config=$(OBJSTORE_CONFIG)', + '--wait', ]) + container.withEnv([ containerEnv.fromSecretRef( From 79712b6b950bbd0f69892d56c3fb7f2b83c839fd Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 17 Jan 2019 16:27:57 +0100 Subject: [PATCH 545/638] kube-prometheus: Remove duplicate TLS cipher suites config --- .../kube-state-metrics.libsonnet | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 4a9ee58f..30a176f5 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -4,31 +4,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; _config+:: { namespace: 'default', - tlsCipherSuites: [ - // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - // 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 - 
'TLS_RSA_WITH_AES_128_CBC_SHA', - 'TLS_RSA_WITH_AES_256_CBC_SHA', - 'TLS_RSA_WITH_AES_128_CBC_SHA256', - 'TLS_RSA_WITH_AES_128_GCM_SHA256', - 'TLS_RSA_WITH_AES_256_GCM_SHA384', - // 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', - 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', - // 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - // 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 - 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', - 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', - 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', - 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', - 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', - 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', - 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', - 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', - 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', - 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', - ], - kubeStateMetrics+:: { collectors: '', // empty string gets a default set scrapeInterval: '30s', From ac1eda0b1a98a348b81223256544bfd9f4532dba Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 17 Jan 2019 16:31:05 +0100 Subject: [PATCH 546/638] kube-prometheus: Bump kube-prometheus jsonnet dependency --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index daec417f..5c6929a4 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "3e7bb3da22597e0a2b7ed0a7b6e0f6a82796eb5c" + "version": "0905fc6d7d74f1c89a88e966ef7d29c55bcf1f35" }, { "name": "ksonnet", From 6209f3620c909c6dce7a21ae7366140fb1557fcc Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 17 Jan 2019 18:45:39 +0100 Subject: [PATCH 547/638] Improve the Thanos documentation for Thanos v0.2 --- example.jsonnet | 11 
++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/example.jsonnet b/example.jsonnet index 2a10509c..fcd2bb01 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -1,8 +1,9 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, -}; +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + From e3b989ccd36168c702ed7169e1ed54e60165d227 Mon Sep 17 00:00:00 2001 From: "Purandare, Aditya" Date: Wed, 16 Jan 2019 01:36:32 +0530 Subject: [PATCH 548/638] Add prometheus admin flag feature to helm, kube-prometheus and jsonnet files --- README.md | 3 +++ example.jsonnet | 4 ++++ jsonnet/kube-prometheus/kube-prometheus.libsonnet | 3 +++ jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 ++ ...ometheus-operator-0prometheusCustomResourceDefinition.yaml | 3 +++ manifests/prometheus-prometheus.yaml | 1 + 6 files changed, 16 insertions(+) diff --git a/README.md b/README.md index bbc0d0a7..83b1a635 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,8 @@ In order to update the kube-prometheus dependency, simply use the jsonnet-bundle e.g. 
of how to compile the manifests: `./build.sh example.jsonnet` +> before compiling, install `gojsontoyaml` tool with `go get github.com/brancz/gojsontoyaml` + Here's [example.jsonnet](example.jsonnet): [embedmd]:# (example.jsonnet) @@ -279,6 +281,7 @@ These are the available fields with their respective default values: names: 'k8s', replicas: 2, rules: {}, + enableAdminApi: 'true', }, alertmanager+:: { diff --git a/example.jsonnet b/example.jsonnet index 2a10509c..aa4c50f1 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -2,6 +2,10 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { _config+:: { namespace: 'monitoring', }, + prometheus+:: { + name: 'k8s', + enableAdminApi: 'true', + }, }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 7dae5f38..915d363f 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -82,6 +82,8 @@ local configMapList = k.core.v1.configMapList; prometheusSelector: 'job="prometheus-k8s"', prometheusOperatorSelector: 'job="prometheus-operator"', + enableAdminApi: 'true', + jobs: { Kubelet: $._config.kubeletSelector, KubeScheduler: $._config.kubeSchedulerSelector, @@ -97,6 +99,7 @@ local configMapList = k.core.v1.configMapList; prometheus+:: { rules: $.prometheusRules + $.prometheusAlerts, + enableAdminApi: $._config.enableAdminApi, }, grafana+:: { diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 89d55bcc..047a6dd0 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -18,6 +18,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheus+:: { name: 'k8s', + enableAdminApi: 'true', replicas: 2, rules: {}, 
renderedRules: {}, @@ -168,6 +169,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; replicas: $._config.prometheus.replicas, version: $._config.versions.prometheus, baseImage: $._config.imageRepos.prometheus, + enableAdminApi: $._config.prometheus.enableAdminApi, serviceAccountName: 'prometheus-' + $._config.prometheus.name, serviceMonitorSelector: {}, serviceMonitorNamespaceSelector: {}, diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 627ce96d..a9b3cdfa 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1538,6 +1538,9 @@ spec: required: - name type: array + enableAdminApi: + description: Enable access to prometheus web admin API. More info: https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis + type: boolean evaluationInterval: description: Interval between consecutive evaluations. 
type: string diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index c16914b0..18bdcf74 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -12,6 +12,7 @@ spec: namespace: monitoring port: web baseImage: quay.io/prometheus/prometheus + enableAdminApi: true nodeSelector: beta.kubernetes.io/os: linux replicas: 2 From 95771caf8c5bb77b82329cdc67357ea208a3ed95 Mon Sep 17 00:00:00 2001 From: "Purandare, Aditya" Date: Wed, 16 Jan 2019 22:05:47 +0530 Subject: [PATCH 549/638] Remove updates to helm chart, fix typo in flag, add make generated files --- README.md | 2 +- example.jsonnet | 2 +- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 4 ++-- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 4 ++-- ...ometheus-operator-0prometheusCustomResourceDefinition.yaml | 2 +- manifests/prometheus-prometheus.yaml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 83b1a635..aa3b5674 100644 --- a/README.md +++ b/README.md @@ -281,7 +281,7 @@ These are the available fields with their respective default values: names: 'k8s', replicas: 2, rules: {}, - enableAdminApi: 'true', + enableAdminAPI: 'true', }, alertmanager+:: { diff --git a/example.jsonnet b/example.jsonnet index aa4c50f1..db480ea8 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -4,7 +4,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { }, prometheus+:: { name: 'k8s', - enableAdminApi: 'true', + enableAdminAPI: 'true', }, }; diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 915d363f..814cfe60 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -82,7 +82,7 @@ local configMapList = k.core.v1.configMapList; prometheusSelector: 'job="prometheus-k8s"', prometheusOperatorSelector: 'job="prometheus-operator"', - enableAdminApi: 
'true', + enableAdminAPI: 'true', jobs: { Kubelet: $._config.kubeletSelector, @@ -99,7 +99,7 @@ local configMapList = k.core.v1.configMapList; prometheus+:: { rules: $.prometheusRules + $.prometheusAlerts, - enableAdminApi: $._config.enableAdminApi, + enableAdminAPI: $._config.enableAdminAPI, }, grafana+:: { diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 047a6dd0..b6ce301a 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -18,7 +18,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheus+:: { name: 'k8s', - enableAdminApi: 'true', + enableAdminAPI: 'true', replicas: 2, rules: {}, renderedRules: {}, @@ -169,7 +169,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; replicas: $._config.prometheus.replicas, version: $._config.versions.prometheus, baseImage: $._config.imageRepos.prometheus, - enableAdminApi: $._config.prometheus.enableAdminApi, + enableAdminAPI: $._config.prometheus.enableAdminAPI, serviceAccountName: 'prometheus-' + $._config.prometheus.name, serviceMonitorSelector: {}, serviceMonitorNamespaceSelector: {}, diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index a9b3cdfa..abbf266f 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1538,7 +1538,7 @@ spec: required: - name type: array - enableAdminApi: + enableAdminAPI: description: Enable access to prometheus web admin API. 
More info: https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis type: boolean evaluationInterval: diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 18bdcf74..5abe34b3 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -12,7 +12,7 @@ spec: namespace: monitoring port: web baseImage: quay.io/prometheus/prometheus - enableAdminApi: true + enableAdminAPI: true nodeSelector: beta.kubernetes.io/os: linux replicas: 2 From 5ce011173f0871b5d6f403e84686381625769d19 Mon Sep 17 00:00:00 2001 From: "Purandare, Aditya" Date: Thu, 17 Jan 2019 20:13:37 +0530 Subject: [PATCH 550/638] Add a clear warning, remove admin API from jsonnet example --- example.jsonnet | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/example.jsonnet b/example.jsonnet index db480ea8..83b439f1 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -1,11 +1,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { _config+:: { namespace: 'monitoring', - }, - prometheus+:: { - name: 'k8s', - enableAdminAPI: 'true', - }, + } }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + From 85c4c2b8dfacfebf7000114c1d5d43002ff2d8e0 Mon Sep 17 00:00:00 2001 From: "Purandare, Aditya" Date: Thu, 17 Jan 2019 20:21:05 +0530 Subject: [PATCH 551/638] Turn prometheus admin API off by default --- README.md | 2 +- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 2 +- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- manifests/prometheus-prometheus.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index aa3b5674..fd4836c3 100644 --- a/README.md +++ b/README.md @@ -281,7 +281,7 @@ These are the available fields with their respective default values: names: 'k8s', replicas: 2, rules: {}, - enableAdminAPI: 'true', + enableAdminAPI: 'false', }, alertmanager+:: { diff --git 
a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 814cfe60..3a25e52c 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -82,7 +82,7 @@ local configMapList = k.core.v1.configMapList; prometheusSelector: 'job="prometheus-k8s"', prometheusOperatorSelector: 'job="prometheus-operator"', - enableAdminAPI: 'true', + enableAdminAPI: 'false', jobs: { Kubelet: $._config.kubeletSelector, diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index b6ce301a..48bb6c0e 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -18,7 +18,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheus+:: { name: 'k8s', - enableAdminAPI: 'true', + enableAdminAPI: 'false', replicas: 2, rules: {}, renderedRules: {}, diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 5abe34b3..2b830347 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -12,7 +12,7 @@ spec: namespace: monitoring port: web baseImage: quay.io/prometheus/prometheus - enableAdminAPI: true + enableAdminAPI: false nodeSelector: beta.kubernetes.io/os: linux replicas: 2 From fcda42b12315e4dd66c1cfa6dc4fa19547698dd4 Mon Sep 17 00:00:00 2001 From: "Purandare, Aditya" Date: Thu, 17 Jan 2019 21:28:30 +0530 Subject: [PATCH 552/638] Remove api flag from jsonnet files --- README.md | 1 - example.jsonnet | 2 +- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 3 --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 1 - manifests/prometheus-prometheus.yaml | 1 - 5 files changed, 1 insertion(+), 7 deletions(-) diff --git a/README.md b/README.md index fd4836c3..55c6c29a 100644 --- a/README.md +++ b/README.md @@ -281,7 +281,6 @@ These are the available fields 
with their respective default values: names: 'k8s', replicas: 2, rules: {}, - enableAdminAPI: 'false', }, alertmanager+:: { diff --git a/example.jsonnet b/example.jsonnet index 83b439f1..2a10509c 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -1,7 +1,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { _config+:: { namespace: 'monitoring', - } + }, }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 3a25e52c..7dae5f38 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -82,8 +82,6 @@ local configMapList = k.core.v1.configMapList; prometheusSelector: 'job="prometheus-k8s"', prometheusOperatorSelector: 'job="prometheus-operator"', - enableAdminAPI: 'false', - jobs: { Kubelet: $._config.kubeletSelector, KubeScheduler: $._config.kubeSchedulerSelector, @@ -99,7 +97,6 @@ local configMapList = k.core.v1.configMapList; prometheus+:: { rules: $.prometheusRules + $.prometheusAlerts, - enableAdminAPI: $._config.enableAdminAPI, }, grafana+:: { diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 48bb6c0e..b9b3810c 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -18,7 +18,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheus+:: { name: 'k8s', - enableAdminAPI: 'false', replicas: 2, rules: {}, renderedRules: {}, diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 2b830347..c16914b0 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -12,7 +12,6 @@ spec: namespace: monitoring port: web baseImage: quay.io/prometheus/prometheus - enableAdminAPI: 
false nodeSelector: beta.kubernetes.io/os: linux replicas: 2 From 852eaff7f7148fb547a86cbd16d585d5e2844eb8 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 18 Jan 2019 16:01:45 +0100 Subject: [PATCH 553/638] contrib/kube-prometheus: Generate new updated docs --- README.md | 11 ++++++----- ...eloping-prometheus-rules-and-grafana-dashboards.md | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index bbc0d0a7..a0ce40f0 100644 --- a/README.md +++ b/README.md @@ -147,11 +147,12 @@ Here's [example.jsonnet](example.jsonnet): [embedmd]:# (example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, -}; +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index 72deb0e3..a0c1ff76 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -10,11 +10,12 @@ As a basis, all examples in this guide are based on the base example of the kube [embedmd]:# (../example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, -}; +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + 
From 969106668a3587fcffdebfa4890aaf1099773d47 Mon Sep 17 00:00:00 2001 From: "Purandare, Aditya" Date: Fri, 18 Jan 2019 22:00:43 +0530 Subject: [PATCH 554/638] Add the missing file generated by make command --- ...rometheus-operator-0prometheusCustomResourceDefinition.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index abbf266f..627ce96d 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1538,9 +1538,6 @@ spec: required: - name type: array - enableAdminAPI: - description: Enable access to prometheus web admin API. More info: https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis - type: boolean evaluationInterval: description: Interval between consecutive evaluations. type: string From 52b87babd238b7ae7c17569b4b5ff5aa5e63ffa0 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 21 Jan 2019 14:49:47 +0100 Subject: [PATCH 555/638] contrib/kube-prometheus: Update kubernetes mixin for fixed CPUThrottlingHigh --- jsonnetfile.lock.json | 2 +- manifests/grafana-dashboardDefinitions.yaml | 4 ++-- manifests/prometheus-rules.yaml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5c6929a4..86630edb 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144" + "version": "2e358ff68f00bd0dead547beaddc6ce7526864e8" }, { "name": "grafonnet", diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 4900caae..dc491cb6 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -6730,7 +6730,7 @@ items: "steppedLine": false, 
"targets": [ { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ container_name }}", @@ -6833,7 +6833,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))", + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 05e0debf..e578dad0 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -626,8 +626,8 @@ spec: }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.' 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh - expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m])) - by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m])) + expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\", + }[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)\n > 25 \n" for: 15m labels: From f0d8ba64d7a0088d04aa1ad57dbb260381eb1c8b Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Mon, 21 Jan 2019 17:09:14 +0100 Subject: [PATCH 556/638] kube-prometheus: Remove enableAdminAPI configuration option Manifest generation fails if this option is not set. Instead, remove the option and require users to patch the object in case they need it. --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 1 - 1 file changed, 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index b9b3810c..89d55bcc 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -168,7 +168,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; replicas: $._config.prometheus.replicas, version: $._config.versions.prometheus, baseImage: $._config.imageRepos.prometheus, - enableAdminAPI: $._config.prometheus.enableAdminAPI, serviceAccountName: 'prometheus-' + $._config.prometheus.name, serviceMonitorSelector: {}, serviceMonitorNamespaceSelector: {}, From 0bacc21a121b8fb9e88f4bc51250e185cac9cc03 Mon Sep 17 00:00:00 2001 From: Jason Murray Date: Wed, 23 Jan 2019 02:29:14 +0100 Subject: [PATCH 557/638] Kubernetes Version bump in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
442dd10b..a4e8c1f3 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ This adapter is an Extension API Server and Kubernetes needs to be have this fea In order to just try out this stack, start minikube with the following command: ``` -$ minikube delete && minikube start --kubernetes-version=v1.10.1 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +$ minikube delete && minikube start --kubernetes-version=v1.13.2 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 ``` ## Quickstart From 63bfbfa5692d779d6851a682c7a683b1c8475291 Mon Sep 17 00:00:00 2001 From: Jason Murray Date: Wed, 23 Jan 2019 02:31:26 +0100 Subject: [PATCH 558/638] Use kubectl apply instead of create Resources can be created _and_ updated with `apply` without receiving errors --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 442dd10b..d25e2c1a 100644 --- a/README.md +++ b/README.md @@ -71,13 +71,13 @@ This project is intended to be used as a library (i.e. the intent is not for you Though for a quickstart a compiled version of the Kubernetes [manifests](manifests) generated with this library (specifically with `example.jsonnet`) is checked into this repository in order to try the content out quickly. To try out the stack un-customized run: * Simply create the stack: ``` -$ kubectl create -f manifests/ || true +$ kubectl apply -f manifests/ || true # It can take a few seconds for the above 'create manifests' command to fully create the following resources, so verify the resources are ready before proceeding. 
until kubectl get customresourcedefinitions servicemonitors.monitoring.coreos.com ; do date; sleep 1; echo ""; done until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done -$ kubectl create -f manifests/ 2>/dev/null || true # This command sometimes may need to be done twice (to workaround a race condition). +$ kubectl apply -f manifests/ 2>/dev/null || true # This command sometimes may need to be done twice (to workaround a race condition). ``` * And to teardown the stack: ``` From 816f38811f274f289d230b557751835c96e933a8 Mon Sep 17 00:00:00 2001 From: Jason Murray Date: Wed, 23 Jan 2019 14:10:01 +0100 Subject: [PATCH 559/638] Remove || true from README --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d25e2c1a..f0fabe5a 100644 --- a/README.md +++ b/README.md @@ -71,17 +71,18 @@ This project is intended to be used as a library (i.e. the intent is not for you Though for a quickstart a compiled version of the Kubernetes [manifests](manifests) generated with this library (specifically with `example.jsonnet`) is checked into this repository in order to try the content out quickly. To try out the stack un-customized run: * Simply create the stack: ``` -$ kubectl apply -f manifests/ || true +$ kubectl apply -f manifests/ # It can take a few seconds for the above 'create manifests' command to fully create the following resources, so verify the resources are ready before proceeding. 
-until kubectl get customresourcedefinitions servicemonitors.monitoring.coreos.com ; do date; sleep 1; echo ""; done -until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done +$ until kubectl get customresourcedefinitions servicemonitors.monitoring.coreos.com ; do date; sleep 1; echo ""; done +$ until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done -$ kubectl apply -f manifests/ 2>/dev/null || true # This command sometimes may need to be done twice (to workaround a race condition). +$ kubectl apply -f manifests/ # This command sometimes may need to be done twice (to workaround a race condition). ``` + * And to teardown the stack: ``` -$ kubectl delete -f manifests/ || true +$ kubectl delete -f manifests/ ``` ### Access the dashboards From 5ba6285ede463aec803600b6860c19e12d0bebe8 Mon Sep 17 00:00:00 2001 From: Laurent Godet Date: Fri, 11 Jan 2019 14:04:38 +0000 Subject: [PATCH 560/638] Update remaining node-exporter rules Regenerate kube-prometheus --- .../rules/node-rules.libsonnet | 14 +++++++------- jsonnetfile.lock.json | 6 +++--- manifests/prometheus-rules.yaml | 19 +++++++++++-------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/jsonnet/kube-prometheus/rules/node-rules.libsonnet b/jsonnet/kube-prometheus/rules/node-rules.libsonnet index ec3a331e..e3396b08 100644 --- a/jsonnet/kube-prometheus/rules/node-rules.libsonnet +++ b/jsonnet/kube-prometheus/rules/node-rules.libsonnet @@ -5,31 +5,31 @@ name: 'kube-prometheus-node-recording.rules', rules: [ { - expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)', record: 'instance:node_cpu:rate:sum', }, { - expr: 'sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)', + expr: 'sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)', 
record: 'instance:node_filesystem_usage:sum', }, { - expr: 'sum(rate(node_network_receive_bytes[3m])) BY (instance)', + expr: 'sum(rate(node_network_receive_bytes_total[3m])) BY (instance)', record: 'instance:node_network_receive_bytes:rate:sum', }, { - expr: 'sum(rate(node_network_transmit_bytes[3m])) BY (instance)', + expr: 'sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)', record: 'instance:node_network_transmit_bytes:rate:sum', }, { - expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)', record: 'instance:node_cpu:ratio', }, { - expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))', record: 'cluster:node_cpu:sum_rate5m', }, { - expr: 'cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))', + expr: 'cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))', record: 'cluster:node_cpu:ratio', }, ], diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 86630edb..f1ebbce6 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "0905fc6d7d74f1c89a88e966ef7d29c55bcf1f35" + "version": "53c89042b8b22d160040e8322358cbcdeb74ab0c" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "2e358ff68f00bd0dead547beaddc6ce7526864e8" + "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144" }, { "name": "grafonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "a00bff7848db1dcead692e2bea1d7c87e8a2c157" + "version": "fae6e92407e004894f5e0d71baab212732ddd8c2" } ] } diff --git 
a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index e578dad0..04905a34 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -288,21 +288,24 @@ spec: record: 'node:node_inodes_free:' - name: kube-prometheus-node-recording.rules rules: - - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY + (instance) record: instance:node_cpu:rate:sum - - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance) record: instance:node_filesystem_usage:sum - - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) - / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT + (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) + BY (instance, cpu)) BY (instance) record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) + BY (instance, cpu)) record: cluster:node_cpu:ratio - name: kubernetes-absent rules: From 6c375d1100702de8c031d6e055207706907a0442 
Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 24 Jan 2019 12:09:26 +0100 Subject: [PATCH 561/638] contrib/kube-prometheus: Update revisions for jsonnet dependencies --- jsonnetfile.lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index f1ebbce6..bdb38c6e 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "53c89042b8b22d160040e8322358cbcdeb74ab0c" + "version": "e9469aa907c5e636b1d4888d1b788d1d4fb57839" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144" + "version": "2e358ff68f00bd0dead547beaddc6ce7526864e8" }, { "name": "grafonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "fae6e92407e004894f5e0d71baab212732ddd8c2" + "version": "a00bff7848db1dcead692e2bea1d7c87e8a2c157" } ] } From 0687c56ceaffdb02ba486be3b9913650e36b3ebf Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Thu, 24 Jan 2019 11:30:21 +0100 Subject: [PATCH 562/638] kube-prometheus: Add e2e tests --- Makefile | 3 + tests/e2e/main_test.go | 118 +++++++++++++++++++++++++++++++++ tests/e2e/prometheus_client.go | 52 +++++++++++++++ tests/e2e/travis-e2e.sh | 24 +++++++ 4 files changed, 197 insertions(+) create mode 100644 tests/e2e/main_test.go create mode 100644 tests/e2e/prometheus_client.go create mode 100755 tests/e2e/travis-e2e.sh diff --git a/Makefile b/Makefile index f40e8104..6b3651ae 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,9 @@ test: $(JB_BINARY) $(JB_BINARY) install ./test.sh +test-e2e: + go test -timeout 55m -v ./tests/e2e + test-in-docker: ../../hack/jsonnet-docker-image @echo ">> Compiling assets and generating Kubernetes manifests" docker run \ diff --git a/tests/e2e/main_test.go b/tests/e2e/main_test.go new file mode 100644 index 00000000..e63730a3 --- /dev/null +++ b/tests/e2e/main_test.go @@ 
-0,0 +1,118 @@ +// Copyright 2019 The prometheus-operator Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package e2e + +import ( + "log" + "os" + "testing" + "time" + + "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +var promClient *prometheusClient + +func TestMain(m *testing.M) { + os.Exit(testMain(m)) +} + +// testMain circumvents the issue, that one can not call `defer` in TestMain, as +// `os.Exit` does not honor `defer` statements. 
For more details see: +// http://blog.englund.nu/golang,/testing/2017/03/12/using-defer-in-testmain.html +func testMain(m *testing.M) int { + kubeConfigPath, ok := os.LookupEnv("KUBECONFIG") + if !ok { + log.Fatal("failed to retrieve KUBECONFIG env var") + } + + config, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath) + if err != nil { + log.Fatal(err) + } + + kubeClient, err := kubernetes.NewForConfig(config) + if err != nil { + log.Fatal(errors.Wrap(err, "creating kubeClient failed")) + } + + promClient = newPrometheusClient(kubeClient) + + return m.Run() +} + +func TestQueryPrometheus(t *testing.T) { + t.Parallel() + queries := []struct { + query string + expectN int + }{ + { + // query: `up{job="node-exporter"} == 1`, + // expectN: 1, + // }, { + // query: `up{job="kubelet"} == 1`, + // expectN: 1, + // }, { + query: `up{job="apiserver"} == 1`, + expectN: 1, + // }, { + // query: `up{job="kube-state-metrics"} == 1`, + // expectN: 1, + }, { + query: `up{job="prometheus-k8s"} == 1`, + expectN: 1, + }, { + query: `up{job="prometheus-operator"} == 1`, + expectN: 1, + }, { + query: `up{job="alertmanager-main"} == 1`, + expectN: 2, + }, + } + + // Wait for pod to respond at queries at all. Then start verifying their results. + err := wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) { + _, err := promClient.query("up") + return err == nil, nil + }) + if err != nil { + t.Fatal(errors.Wrap(err, "wait for prometheus-k8s")) + } + + err = wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) { + defer t.Log("---------------------------\n") + + for _, q := range queries { + n, err := promClient.query(q.query) + if err != nil { + return false, err + } + if n < q.expectN { + // Don't return an error as targets may only become visible after a while. 
+ t.Logf("expected at least %d results for %q but got %d", q.expectN, q.query, n) + return false, nil + } + t.Logf("query %q succeeded", q.query) + } + return true, nil + }) + if err != nil { + t.Fatal(err) + } +} diff --git a/tests/e2e/prometheus_client.go b/tests/e2e/prometheus_client.go new file mode 100644 index 00000000..b87ce3e5 --- /dev/null +++ b/tests/e2e/prometheus_client.go @@ -0,0 +1,52 @@ +// Copyright 2019 The prometheus-operator Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package e2e + +import ( + "k8s.io/client-go/kubernetes" + + "github.com/Jeffail/gabs" +) + +type prometheusClient struct { + kubeClient kubernetes.Interface +} + +func newPrometheusClient(kubeClient kubernetes.Interface) *prometheusClient { + return &prometheusClient{kubeClient} +} + +// Query makes a request against the Prometheus /api/v1/query endpoint. +func (c *prometheusClient) query(query string) (int, error) { + req := c.kubeClient.CoreV1().RESTClient().Get(). + Namespace("monitoring"). + Resource("pods"). + SubResource("proxy"). + Name("prometheus-k8s-0:9090"). 
+ Suffix("/api/v1/query").Param("query", query) + + b, err := req.DoRaw() + if err != nil { + return 0, err + } + + res, err := gabs.ParseJSON(b) + if err != nil { + return 0, err + } + + n, err := res.ArrayCountP("data.result") + return n, err +} diff --git a/tests/e2e/travis-e2e.sh b/tests/e2e/travis-e2e.sh new file mode 100755 index 00000000..45fb974a --- /dev/null +++ b/tests/e2e/travis-e2e.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# exit immediately when a command fails +set -e +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail +# error on unset variables +set -u +# print each command before executing it +set -x + +SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") + +"${SCRIPT_DIR}"/../../../../scripts/create-minikube.sh + +# waiting for kube-dns to be ready +JSONPATH='{range .items[*]}{@.metadata.name}:{range @.status.conditions[*]}{@.type}={@.status};{end}{end}'; until kubectl -n kube-system get pods -lk8s-app=kube-dns -o jsonpath="$JSONPATH" 2>&1 | grep -q "Ready=True"; do sleep 1;echo "waiting for kube-dns to be available"; kubectl get pods --all-namespaces; done + +( + cd "${SCRIPT_DIR}"/../.. 
|| exit + kubectl apply -f manifests + KUBECONFIG=~/.kube/config make test-e2e +) + +"${SCRIPT_DIR}"/../../../../scripts/delete-minikube.sh From 491a2039d7287f92e2e25757c2fea37b2c52255c Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Tue, 22 Jan 2019 11:50:34 +0100 Subject: [PATCH 563/638] kube-prometheus: Update Alertmanager to v0.16.0 --- jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 347d9a3c..02909525 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - alertmanager: 'v0.15.3', + alertmanager: 'v0.16.0', }, imageRepos+:: { From 9f25b17e80c5df11034fea3aed02a88efdd0b48a Mon Sep 17 00:00:00 2001 From: Max Leonard Inden Date: Thu, 24 Jan 2019 15:19:32 +0100 Subject: [PATCH 564/638] kube-prometheus: Re-generate manifests --- jsonnetfile.lock.json | 2 +- manifests/alertmanager-alertmanager.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index bdb38c6e..3f494915 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "e9469aa907c5e636b1d4888d1b788d1d4fb57839" + "version": "698d6d750a9f46d0db4c7ae47ed293979921e89c" }, { "name": "ksonnet", diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index c6f8ce05..376c17ba 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -15,4 +15,4 @@ spec: runAsNonRoot: true runAsUser: 1000 serviceAccountName: alertmanager-main - version: v0.15.3 + version: v0.16.0 From 
4c9912ece53796c900e5a128b9421be3d17c14e0 Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Thu, 24 Jan 2019 19:57:19 +0300 Subject: [PATCH 565/638] sync rules --- jsonnetfile.lock.json | 10 ++++----- manifests/grafana-dashboardDefinitions.yaml | 24 +++++++++++++++++++++ manifests/prometheus-rules.yaml | 6 ++++-- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 3f494915..cfd96219 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "698d6d750a9f46d0db4c7ae47ed293979921e89c" + "version": "e578c3af3a5c6ac26207c4b2ae92af6298e32342" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "2e358ff68f00bd0dead547beaddc6ce7526864e8" + "version": "3b031fd4bb5c3027ab5e76a342758c203b535db0" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "d05a9bd3adfba39ed3b1987a428d9e9863428df5" + "version": "11022f5e920ac1ea960556193e3f0ab57d70d7c5" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "d75c3b260c1077c924d7ea0240250afc235c4cb3" + "version": "eb4d2218cefc621cd31041d46c3dbf328325d68f" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "a00bff7848db1dcead692e2bea1d7c87e8a2c157" + "version": "fa521f4e00fedfb6d98449d92a6408d0b3b0d922" } ] } diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index dc491cb6..f3374edc 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -4896,6 +4896,12 @@ items: data: nodes.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ @@ -6208,6 +6214,12 @@ items: data: persistentvolumesusage.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ @@ -6551,6 +6563,12 @@ items: 
data: pods.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ @@ -7035,6 +7053,12 @@ items: data: statefulset.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 04905a34..5729174f 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -776,7 +776,8 @@ spec: severity: warning - alert: KubeClientCertificateExpiration annotations: - message: Kubernetes API certificate is expiring in less than 7 days. + message: A client certificate used to authenticate to the apiserver is expiring + in less than 7 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 @@ -784,7 +785,8 @@ spec: severity: warning - alert: KubeClientCertificateExpiration annotations: - message: Kubernetes API certificate is expiring in less than 24 hours. + message: A client certificate used to authenticate to the apiserver is expiring + in less than 24 hours. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 From 264cf11bf0120529679c2275a137e163005f9eb9 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 21 Jan 2019 20:04:42 +0100 Subject: [PATCH 566/638] contrib/kube-prometheus: Begin to fix cipher suites --- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 12 ++++++------ .../kube-state-metrics/kube-state-metrics.libsonnet | 1 + .../node-exporter/node-exporter.libsonnet | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 7dae5f38..4b0d5a22 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -59,12 +59,12 @@ local configMapList = k.core.v1.configMapList; 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', - 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', - 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', - 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', - 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', - 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', - 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', + // 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // Doesn't work with h2 + // 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // Doesn't work with h2 + // 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', // TODO: Might not work with h2 + // 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', // TODO: Might not work with h2 + // 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2 + // 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2 ], cadvisorSelector: 'job="kubelet"', diff --git 
a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 30a176f5..2cf7ebbc 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -134,6 +134,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local proxyClusterMetrics = container.new('kube-rbac-proxy-main', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ + '--logtostderr', '--secure-listen-address=:8443', '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites), '--upstream=http://127.0.0.1:8081/', diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 7030d2f9..052f8dc6 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -106,6 +106,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local proxy = container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ + '--logtostderr', '--secure-listen-address=$(IP):' + $._config.nodeExporter.port, '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites), '--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/', From 8a29b4f383be0bcf61a847d1173b04bca148c292 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Tue, 22 Jan 2019 16:59:34 +0100 Subject: [PATCH 567/638] Update ciphers and infos for them --- .../kube-prometheus/kube-prometheus.libsonnet | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 4b0d5a22..1e9f8565 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet 
+++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -43,24 +43,28 @@ local configMapList = k.core.v1.configMapList; namespace: 'default', tlsCipherSuites: [ + 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 + 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 + // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 // 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 - 'TLS_RSA_WITH_AES_128_CBC_SHA', - 'TLS_RSA_WITH_AES_256_CBC_SHA', + // 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 + // 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 'TLS_RSA_WITH_AES_128_CBC_SHA256', - 'TLS_RSA_WITH_AES_128_GCM_SHA256', - 'TLS_RSA_WITH_AES_256_GCM_SHA384', + // 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2 + // 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2 // 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', - 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', + // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA',// disabled by h2 + // 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA',// disabled by h2 // 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 // 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 - 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', - 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', + // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', - // 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // Doesn't work with h2 - // 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // Doesn't work with h2 + + // disabled by h2 means: 
https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go + // 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', // TODO: Might not work with h2 // 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', // TODO: Might not work with h2 // 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2 From 7bc69645b0e6c0ca9744b33cc42f4e82cd3cde83 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 23 Jan 2019 17:35:26 +0100 Subject: [PATCH 568/638] contrib/kube-prometheus: Log to stderr in kube-state-metrics rbac-proxy --- .../kube-state-metrics/kube-state-metrics.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 2cf7ebbc..01014234 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -146,6 +146,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local proxySelfMetrics = container.new('kube-rbac-proxy-self', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ + '--logtostderr', '--secure-listen-address=:9443', '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites), '--upstream=http://127.0.0.1:8082/', From f531cc4883b3a97d08ea14532b741b8634e0d33d Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 23 Jan 2019 18:09:41 +0100 Subject: [PATCH 569/638] contrib/kube-prometheus: Bump kube-rbac-proxy version to v0.4.1 --- .../kube-state-metrics/kube-state-metrics.libsonnet | 2 +- jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 01014234..a0dddff2 100644 --- 
a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -17,7 +17,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { kubeStateMetrics: 'v1.5.0', - kubeRbacProxy: 'v0.4.0', + kubeRbacProxy: 'v0.4.1', addonResizer: '1.0', }, diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 052f8dc6..93c90a1f 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -6,7 +6,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { nodeExporter: 'v0.17.0', - kubeRbacProxy: 'v0.4.0', + kubeRbacProxy: 'v0.4.1', }, imageRepos+:: { From d4ce83203c766bea942c55bacee1ebdb7bd528aa Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 23 Jan 2019 18:30:00 +0100 Subject: [PATCH 570/638] contrib/kube-prometheus: Generate new manifests with update ciphers --- jsonnetfile.lock.json | 2 +- manifests/kube-state-metrics-deployment.yaml | 10 ++++++---- manifests/node-exporter-daemonset.yaml | 5 +++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index cfd96219..1ae4e521 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "e578c3af3a5c6ac26207c4b2ae92af6298e32342" + "version": "9c0d2e34fa0a8bc22049e50bae46f4bb87ec2045" }, { "name": "ksonnet", diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index 1e8fc1e2..94f7b36a 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -17,10 +17,11 @@ spec: spec: containers: - args: + - --logtostderr - --secure-listen-address=:8443 - - 
--tls-cipher-suites=TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - --upstream=http://127.0.0.1:8081/ - image: quay.io/coreos/kube-rbac-proxy:v0.4.0 + image: quay.io/coreos/kube-rbac-proxy:v0.4.1 name: kube-rbac-proxy-main ports: - containerPort: 8443 @@ -33,10 +34,11 @@ spec: cpu: 10m memory: 20Mi - args: + - --logtostderr - --secure-listen-address=:9443 - - --tls-cipher-suites=TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - 
--upstream=http://127.0.0.1:8082/ - image: quay.io/coreos/kube-rbac-proxy:v0.4.0 + image: quay.io/coreos/kube-rbac-proxy:v0.4.1 name: kube-rbac-proxy-self ports: - containerPort: 9443 diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index c3de3679..e8ea15f1 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -43,15 +43,16 @@ spec: name: root readOnly: true - args: + - --logtostderr - --secure-listen-address=$(IP):9100 - - --tls-cipher-suites=TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - --upstream=http://127.0.0.1:9100/ env: - name: IP valueFrom: fieldRef: fieldPath: status.podIP - image: quay.io/coreos/kube-rbac-proxy:v0.4.0 + image: quay.io/coreos/kube-rbac-proxy:v0.4.1 name: kube-rbac-proxy ports: - containerPort: 9100 From fd697ffbff6299ddab58f98181afde616e459409 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Mon, 28 Jan 2019 14:20:58 +0100 Subject: [PATCH 571/638] contrib/kube-prometheus: remove node role This commit removes get/list/watch on nodes for the Prometheus-k8s instance, as Prometheus pods do not need that privilege for anything. 
--- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 1 - 1 file changed, 1 deletion(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 89d55bcc..9dd9b7cc 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -129,7 +129,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local coreRule = policyRule.new() + policyRule.withApiGroups(['']) + policyRule.withResources([ - 'nodes', 'services', 'endpoints', 'pods', From 274e1d5924151dfcf537d5750e1a652055a63381 Mon Sep 17 00:00:00 2001 From: Lucas Serven Date: Mon, 28 Jan 2019 14:33:26 +0100 Subject: [PATCH 572/638] contrib/kube-prometheus: regenerate --- jsonnetfile.lock.json | 2 +- manifests/prometheus-roleSpecificNamespaces.yaml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 1ae4e521..46794798 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "9c0d2e34fa0a8bc22049e50bae46f4bb87ec2045" + "version": "d5f758dc5d07b214cd5cdf639847ab0197f42f76" }, { "name": "ksonnet", diff --git a/manifests/prometheus-roleSpecificNamespaces.yaml b/manifests/prometheus-roleSpecificNamespaces.yaml index b305774a..b920b886 100644 --- a/manifests/prometheus-roleSpecificNamespaces.yaml +++ b/manifests/prometheus-roleSpecificNamespaces.yaml @@ -9,7 +9,6 @@ items: - apiGroups: - "" resources: - - nodes - services - endpoints - pods @@ -26,7 +25,6 @@ items: - apiGroups: - "" resources: - - nodes - services - endpoints - pods @@ -43,7 +41,6 @@ items: - apiGroups: - "" resources: - - nodes - services - endpoints - pods From 8e79ff2663ab6935470a819b130967fca8a63f83 Mon Sep 17 00:00:00 2001 From: Jason Murray Date: Thu, 31 Jan 2019 20:17:23 +0100 Subject: [PATCH 573/638] Fixed paths for patching 
manifests --- docs/GKE-cadvisor-support.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/GKE-cadvisor-support.md b/docs/GKE-cadvisor-support.md index f1a88825..0d763ac0 100644 --- a/docs/GKE-cadvisor-support.md +++ b/docs/GKE-cadvisor-support.md @@ -20,19 +20,17 @@ Or, you can patch and re-apply your existing manifests with: On linux: ``` -sed -i -e 's/https/http/g' \ -contrib/kube-prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +sed -i -e 's/https/http/g' manifests/prometheus-serviceMonitorKubelet.yaml ``` On MacOs: ``` -sed -i '' -e 's/https/http/g' \ -contrib/kube-prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +sed -i '' -e 's/https/http/g' manifests/prometheus-serviceMonitorKubelet.yaml ``` After you have modified the yaml file please run ``` -kubectl apply -f contrib/kube-prometheus/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +kubectl apply -f manifests/prometheus-serviceMonitorKubelet.yaml ``` From 3cb249d061f3cf7d8a89d11aa7c461b5717795e6 Mon Sep 17 00:00:00 2001 From: Sa'ad Date: Fri, 1 Feb 2019 10:51:20 +0000 Subject: [PATCH 574/638] FIx typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aabcb510..07d26b5b 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ $ kubectl delete -f manifests/ Prometheus, Grafana, and Alertmanager dashboards can be accessed quickly using `kubectl port-forward` after running the quickstart via the commands below. Kubernetes 1.10 or later is required. -> Note: There are instructions on how to route to these pods behdind an ingress controller in the [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) section. 
+> Note: There are instructions on how to route to these pods behind an ingress controller in the [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) section. Prometheus From a09da4faf516e172e258ccc91944d750efa4ae74 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Mon, 4 Feb 2019 15:45:52 +0100 Subject: [PATCH 575/638] Update component versions in docs and jsonnets --- README.md | 14 ++--- .../jsonnet-snippets/kops-coredns.jsonnet | 2 +- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- jsonnetfile.lock.json | 10 +-- ...r-0prometheusCustomResourceDefinition.yaml | 61 +++++++++++++++++-- .../0prometheus-operator-clusterRole.yaml | 2 + .../0prometheus-operator-deployment.yaml | 4 +- manifests/prometheus-rules.yaml | 38 ++++++------ 8 files changed, 94 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index aabcb510..9a8354d3 100644 --- a/README.md +++ b/README.md @@ -260,13 +260,13 @@ These are the available fields with their respective default values: namespace: "default", versions+:: { - alertmanager: "v0.15.3", - nodeExporter: "v0.16.0", - kubeStateMetrics: "v1.3.1", - kubeRbacProxy: "v0.3.1", + alertmanager: "v0.16.0", + nodeExporter: "v0.17.0", + kubeStateMetrics: "v1.5.0", + kubeRbacProxy: "v0.4.1", addonResizer: "1.0", - prometheusOperator: "v0.24.0", - prometheus: "v2.4.3", + prometheusOperator: "v0.28.0", + prometheus: "v2.5.0", }, imageRepos+:: { @@ -374,7 +374,7 @@ If your kops cluster is using CoreDNS, there is an additional mixin to import. 
[embedmd]:# (examples/jsonnet-snippets/kops-coredns.jsonnet) ```jsonnet (import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + (import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') ``` diff --git a/examples/jsonnet-snippets/kops-coredns.jsonnet b/examples/jsonnet-snippets/kops-coredns.jsonnet index 6e308579..6ba445df 100644 --- a/examples/jsonnet-snippets/kops-coredns.jsonnet +++ b/examples/jsonnet-snippets/kops-coredns.jsonnet @@ -1,3 +1,3 @@ (import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + +(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + (import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 652d47a1..7be8827a 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.27.0" + "version": "v0.28.0" }, { "name": "etcd-mixin", diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 46794798..31ffed35 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "d5f758dc5d07b214cd5cdf639847ab0197f42f76" + "version": "9c1ad1e863ddae4ec43f58e260077ec91ea2ae37" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "3b031fd4bb5c3027ab5e76a342758c203b535db0" + "version": "668950e4af13f0153fa1d7b58ebe7023b33f2217" }, { "name": "grafonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "eb4d2218cefc621cd31041d46c3dbf328325d68f" + "version": "ec3d4f943df01f517a083305666cd1c87bcc7e94" }, { "name": "grafana", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": 
"174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8" + "version": "338addbabc8a29b46840df0bb0355c12b96a6f21" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "fa521f4e00fedfb6d98449d92a6408d0b3b0d922" + "version": "1fe6f109c87c4fa47775426a6a60c3b954ed5c33" } ] } diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 627ce96d..158c5cb3 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1538,6 +1538,14 @@ spec: required: - name type: array + enableAdminAPI: + description: 'Enable access to prometheus web admin API. Defaults to + the value of `false`. WARNING: Enabling the admin APIs enables mutating + endpoints, to delete data, shutdown Prometheus, and more. Enabling + this should be done with care and the user is advised to add additional + authentication authorization via a proxy to ensure only clients authorized + to perform these actions can do so. For more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis' + type: boolean evaluationInterval: description: Interval between consecutive evaluations. type: string @@ -1572,6 +1580,9 @@ spec: description: ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP. type: boolean + logFormat: + description: Log format for Prometheus to be configured with. + type: string logLevel: description: Log level for Prometheus to be configured with. type: string @@ -2059,6 +2070,11 @@ spec: description: MinBackoff is the initial retry delay. Gets doubled for every retry. type: string + minShards: + description: MinShards is the minimum number of shards, i.e. + amount of concurrency. 
+ format: int32 + type: integer remoteTimeout: description: Timeout for requests to the remote write endpoint. type: string @@ -2243,6 +2259,25 @@ spec: "In", and the values array contains only "value". The requirements are ANDed. type: object + rules: + description: /--rules.*/ command-line arguments + properties: + alert: + description: /--rules.alert.*/ command-line arguments + properties: + forGracePeriod: + description: Minimum duration between alert and restored 'for' + state. This is maintained only for alerts with configured + 'for' time greater than grace period. + type: string + forOutageTolerance: + description: Max time to tolerate prometheus outage for restoring + 'for' state of alert. + type: string + resendDelay: + description: Minimum amount of time to wait before resending + an alert to Alertmanager. + type: string scrapeInterval: description: Interval between consecutive scrapes. type: string @@ -2941,8 +2976,9 @@ spec: description: Thanos base image if other than default. type: string gcs: - description: ThanosGCSSpec defines parameters for use of Google - Cloud Storage (GCS) with Thanos. + description: 'Deprecated: ThanosGCSSpec should be configured with + an ObjectStorageConfig secret starting with Thanos v0.2.0. ThanosGCSSpec + will be removed.' properties: bucket: description: Google Cloud Storage bucket name for stored blocks. @@ -2970,6 +3006,22 @@ spec: to ensure the Prometheus Operator knows what version of Thanos is being configured. type: string + objectStorageConfig: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + description: 'Name of the referent. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be + defined + type: boolean + required: + - key peers: description: Peers is a DNS name for Thanos to discover peers through. type: string @@ -2988,8 +3040,9 @@ spec: to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object s3: - description: ThanosS3Spec defines parameters for of AWS Simple Storage - Service (S3) with Thanos. (S3 compatible services apply as well) + description: 'Deprecated: ThanosS3Spec should be configured with + an ObjectStorageConfig secret starting with Thanos v0.2.0. ThanosS3Spec + will be removed.' properties: accessKey: description: SecretKeySelector selects a key of a Secret. diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index e0ac283a..123f78e9 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -44,11 +44,13 @@ rules: - "" resources: - services + - services/finalizers - endpoints verbs: - get - create - update + - delete - apiGroups: - "" resources: diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 55ad9043..1f880582 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -20,8 +20,8 @@ spec: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.27.0 - image: quay.io/coreos/prometheus-operator:v0.27.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.28.0 + image: quay.io/coreos/prometheus-operator:v0.28.0 name: prometheus-operator ports: - 
containerPort: 8080 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 5729174f..443943c0 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -314,7 +314,7 @@ spec: message: Alertmanager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown expr: | - absent(up{job="alertmanager-main"} == 1) + absent(up{job="alertmanager-main",namespace="monitoring"} == 1) for: 15m labels: severity: critical @@ -386,7 +386,7 @@ spec: message: Prometheus has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown expr: | - absent(up{job="prometheus-k8s"} == 1) + absent(up{job="prometheus-k8s",namespace="monitoring"} == 1) for: 15m labels: severity: critical @@ -395,7 +395,7 @@ spec: message: PrometheusOperator has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown expr: | - absent(up{job="prometheus-operator"} == 1) + absent(up{job="prometheus-operator",namespace="monitoring"} == 1) for: 15m labels: severity: critical @@ -799,7 +799,7 @@ spec: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. 
expr: | - count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 + count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 for: 5m labels: severity: critical @@ -808,7 +808,7 @@ spec: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. expr: | - alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0 + alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0 for: 10m labels: severity: warning @@ -816,9 +816,9 @@ spec: annotations: message: Alertmanager has not found all other members of the cluster. 
expr: | - alertmanager_cluster_members{job="alertmanager-main"} + alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"} != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="alertmanager-main"}) + count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}) for: 5m labels: severity: critical @@ -865,7 +865,7 @@ spec: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} summary: Reloading Prometheus' configuration failed expr: | - prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0 + prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0 for: 10m labels: severity: warning @@ -875,7 +875,7 @@ spec: $labels.pod}} summary: Prometheus' alert notification queue is running full expr: | - predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"} + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"} for: 10m labels: severity: warning @@ -885,7 +885,7 @@ spec: $labels.pod}} to Alertmanager {{$labels.Alertmanager}} summary: Errors while sending alert from Prometheus expr: | - rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01 + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01 for: 10m labels: severity: warning @@ -895,7 +895,7 @@ spec: $labels.pod}} to Alertmanager {{$labels.Alertmanager}} summary: Errors while sending alerts from Prometheus expr: | - 
rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03 + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03 for: 10m labels: severity: critical @@ -905,7 +905,7 @@ spec: to any Alertmanagers summary: Prometheus is not connected to any Alertmanagers expr: | - prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1 + prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1 for: 10m labels: severity: warning @@ -915,7 +915,7 @@ spec: reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk expr: | - increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0 + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0 for: 12h labels: severity: warning @@ -925,7 +925,7 @@ spec: compaction failures over the last four hours.' summary: Prometheus has issues compacting sample blocks expr: | - increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0 + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0 for: 12h labels: severity: warning @@ -935,7 +935,7 @@ spec: log (WAL).' summary: Prometheus write-ahead log is corrupted expr: | - tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0 + tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0 for: 4h labels: severity: warning @@ -945,7 +945,7 @@ spec: samples. 
summary: Prometheus isn't ingesting samples expr: | - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0 + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 for: 10m labels: severity: warning @@ -955,7 +955,7 @@ spec: due to duplicate timestamps but different values' summary: Prometheus has many samples rejected expr: | - increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0 + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning @@ -966,7 +966,7 @@ spec: message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace. expr: | - rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 + rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning @@ -974,7 +974,7 @@ spec: annotations: message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. 
expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning From c01be8de068252a5a033aaaab8aaace02fc7cb4c Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Mon, 4 Feb 2019 20:34:27 +0100 Subject: [PATCH 576/638] Update self-vendoring of jsonnet libs --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 31ffed35..07128558 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "9c1ad1e863ddae4ec43f58e260077ec91ea2ae37" + "version": "3f9400b1788a78c17abb0159cd84295cfc93bc15" }, { "name": "ksonnet", From 31de04d69ca4b485b593cb2a36980c0aaafb6bd5 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 4 Feb 2019 17:43:48 +0100 Subject: [PATCH 577/638] kube-prometheus: Rename DeadMansSwitch to Watchdog --- jsonnet/kube-prometheus/alerts/general.libsonnet | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet index 6ac25703..8802097e 100644 --- a/jsonnet/kube-prometheus/alerts/general.libsonnet +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -16,9 +16,15 @@ }, }, { - alert: 'DeadMansSwitch', + alert: 'Watchdog', annotations: { - message: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', + message: ||| + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. 
There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + |||, }, expr: 'vector(1)', labels: { From 43b9680a96c3241872776dccfcf5da8dbd939343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=93scar=20Mu=C3=B1oz=20Garrig=C3=B3s?= Date: Tue, 5 Feb 2019 11:32:44 +0100 Subject: [PATCH 578/638] Adding -f flag to fit with new kubectl versions --- experimental/custom-metrics-api/deploy.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/experimental/custom-metrics-api/deploy.sh b/experimental/custom-metrics-api/deploy.sh index 1ac74878..d276afc0 100644 --- a/experimental/custom-metrics-api/deploy.sh +++ b/experimental/custom-metrics-api/deploy.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -kubectl apply -n monitoring custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml -kubectl apply -n monitoring custom-metrics-apiservice.yaml -kubectl apply -n monitoring custom-metrics-cluster-role.yaml -kubectl apply -n monitoring custom-metrics-configmap.yaml -kubectl apply -n monitoring hpa-custom-metrics-cluster-role-binding.yaml +kubectl apply -n monitoring -f custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml +kubectl apply -n monitoring -f custom-metrics-apiservice.yaml +kubectl apply -n monitoring -f custom-metrics-cluster-role.yaml +kubectl apply -n monitoring -f custom-metrics-configmap.yaml +kubectl apply -n monitoring -f hpa-custom-metrics-cluster-role-binding.yaml From 8667c51acd0940f5deaffd48f19110e788941c3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=93scar=20Mu=C3=B1oz=20Garrig=C3=B3s?= Date: Tue, 5 Feb 2019 11:52:04 +0100 Subject: [PATCH 579/638] Removing namespace when creating srv/sample-app --- experimental/custom-metrics-api/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/custom-metrics-api/README.md b/experimental/custom-metrics-api/README.md index 
c5c7102c..e93d809f 100644 --- a/experimental/custom-metrics-api/README.md +++ b/experimental/custom-metrics-api/README.md @@ -14,7 +14,7 @@ When you're done, you can teardown using the `./teardown.sh` script. Additionally, this directory contains a sample app that uses the [Horizontal Pod Autoscaler](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) to scale the Deployment's replicas of Pods up and down as needed. Deploy this app by running `kubectl apply -f sample-app.yaml`. -Make the app accessible on your system, for example by using `kubectl -n monitoring port-forward svc/sample-app 8080`. Next you need to put some load on its http endpoints. +Make the app accessible on your system, for example by using `kubectl port-forward svc/sample-app 8080`. Next you need to put some load on its http endpoints. A tool like [hey](https://github.com/rakyll/hey) is helpful for doing so: `hey -c 20 -n 100000000 http://localhost:8080/metrics` From b700788d75edb1599d117ffeae16185ce97dcbf4 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 5 Feb 2019 12:02:33 +0100 Subject: [PATCH 580/638] kube-prometheus: Update jsonnet deps --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 07128558..f194ca86 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "3f9400b1788a78c17abb0159cd84295cfc93bc15" + "version": "f12ea289a1a99cf9fe433ec3a086d436baef6466" }, { "name": "ksonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "ec3d4f943df01f517a083305666cd1c87bcc7e94" + "version": "24590e90dcfc9d581c4208b4bb3d15df50f1328e" }, { "name": "grafana", From ff0cb3f52aa95f7fc7883779015a4a1cc82cb7d3 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 5 Feb 2019 12:16:06 +0100 Subject: [PATCH 581/638] kube-prometheus: Re-generate --- 
manifests/grafana-dashboardDefinitions.yaml | 72 ++++++++++----------- manifests/prometheus-rules.yaml | 10 ++- 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index f3374edc..92fdc805 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -31,7 +31,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -117,7 +117,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -215,7 +215,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -301,7 +301,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -399,7 +399,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -485,7 +485,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -583,7 +583,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -669,7 +669,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -767,7 +767,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -947,7 +947,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -1033,7 +1033,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 
1, + "id": 2, "legend": { "avg": false, "current": false, @@ -1131,7 +1131,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -1217,7 +1217,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -1315,7 +1315,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -1401,7 +1401,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -1499,7 +1499,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -1585,7 +1585,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -1683,7 +1683,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -1891,7 +1891,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -1975,7 +1975,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -2059,7 +2059,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -2143,7 +2143,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -2227,7 +2227,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -2311,7 +2311,7 @@ items: "datasource": "$datasource", "fill": 1, 
"format": "percentunit", - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -2406,7 +2406,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -2504,7 +2504,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -2771,7 +2771,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -2869,7 +2869,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 9, + "id": 10, "legend": { "avg": false, "current": false, @@ -3218,7 +3218,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -3316,7 +3316,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -3583,7 +3583,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -3681,7 +3681,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -4057,7 +4057,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -4155,7 +4155,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -4422,7 +4422,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -4520,7 +4520,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, diff --git a/manifests/prometheus-rules.yaml 
b/manifests/prometheus-rules.yaml index 443943c0..aaf16ff5 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -831,10 +831,14 @@ spec: for: 10m labels: severity: warning - - alert: DeadMansSwitch + - alert: Watchdog annotations: - message: This is a DeadMansSwitch meant to ensure that the entire alerting - pipeline is functional. + message: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. expr: vector(1) labels: severity: none From 0ca6dbc33717bbca5a38583bf771bc4dfcad43f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=93scar=20Mu=C3=B1oz=20Garrig=C3=B3s?= Date: Tue, 5 Feb 2019 13:26:55 +0100 Subject: [PATCH 582/638] Adding -f flag to fit with new kubectl versions --- experimental/custom-metrics-api/teardown.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/experimental/custom-metrics-api/teardown.sh b/experimental/custom-metrics-api/teardown.sh index a62f685e..b3a455f5 100644 --- a/experimental/custom-metrics-api/teardown.sh +++ b/experimental/custom-metrics-api/teardown.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -kubectl delete -n monitoring custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml -kubectl delete -n monitoring custom-metrics-apiservice.yaml -kubectl delete -n monitoring custom-metrics-cluster-role.yaml -kubectl delete -n monitoring custom-metrics-configmap.yaml -kubectl delete -n monitoring hpa-custom-metrics-cluster-role-binding.yaml +kubectl delete -n monitoring -f custom-metrics-apiserver-resource-reader-cluster-role-binding.yaml +kubectl delete -n monitoring -f custom-metrics-apiservice.yaml +kubectl delete -n monitoring -f 
custom-metrics-cluster-role.yaml +kubectl delete -n monitoring -f custom-metrics-configmap.yaml +kubectl delete -n monitoring -f hpa-custom-metrics-cluster-role-binding.yaml From ee65031526968cf6c5c8a2b003185aaa76215cbf Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 5 Feb 2019 14:04:14 +0100 Subject: [PATCH 583/638] kube-prometheus: Update Grafana to latest beta --- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 932eba85..c30f13f9 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -42,8 +42,12 @@ local configMapList = k.core.v1.configMapList; _config+:: { namespace: 'default', + versions+:: { + grafana: '6.0.0-beta1', + }, + tlsCipherSuites: [ - 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 + 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 From a50c13dd2b5be6089fac324093c7a418adcab568 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 5 Feb 2019 14:07:05 +0100 Subject: [PATCH 584/638] kube-prometheus: Bump jsonnet deps --- jsonnetfile.lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index f194ca86..a3561854 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "f12ea289a1a99cf9fe433ec3a086d436baef6466" + "version": "6963b7410f28575a90a65c0aee1c79c8ef392fbb" }, { "name": "ksonnet", From f1ae21f9582cdb7c0bf73e7ebc7915b091d49379 Mon Sep 17 00:00:00 2001 From: 
Frederic Branczyk Date: Tue, 5 Feb 2019 14:17:51 +0100 Subject: [PATCH 585/638] kube-prometheus: Re-generate --- manifests/grafana-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 60ffc3b4..2caaca6d 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: grafana/grafana:5.2.4 + - image: grafana/grafana:6.0.0-beta1 name: grafana ports: - containerPort: 3000 From 8700e909ad818e0b0cb959f9321a262632feded6 Mon Sep 17 00:00:00 2001 From: Arslanbekov Denis Date: Tue, 5 Feb 2019 20:03:21 +0300 Subject: [PATCH 586/638] Fix links --- ...etheus-operator-0servicemonitorCustomResourceDefinition.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index d2e310fd..431bde39 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -138,7 +138,7 @@ spec: type: string relabelings: description: 'RelabelConfigs to apply to samples before ingestion. - More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#' + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. 
It defines From 3ce06354526d25503930395c0202242dcdc1bb4b Mon Sep 17 00:00:00 2001 From: Arslanbekov Denis Date: Wed, 6 Feb 2019 00:21:51 +0300 Subject: [PATCH 587/638] Fix links in autogeneration files --- ...etheus-operator-0servicemonitorCustomResourceDefinition.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index 431bde39..d2e310fd 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -138,7 +138,7 @@ spec: type: string relabelings: description: 'RelabelConfigs to apply to samples before ingestion. - More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines From a1b85016a3afe66341939920ac8d33b3709267be Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 6 Feb 2019 14:29:26 +0100 Subject: [PATCH 588/638] kube-prometheus: Drop disabled and high cardinality metrics --- .../prometheus/prometheus.libsonnet | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 9dd9b7cc..453ab79c 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -282,6 +282,22 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; insecureSkipVerify: true, }, bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + metricRelabelings: [ + // Drop container_* metrics with no image. 
+ { + sourceLabels: ['__name__', 'image'], + regex: 'container_([a-z_]+);', + action: 'drop', + }, + + // Drop a bunch of metrics which are disabled but still sent, see + // https://github.com/google/cadvisor/issues/1925. + { + sourceLabels: ['__name__'], + regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', + action: 'drop', + }, + ], }, ], selector: { @@ -374,6 +390,16 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; regex: 'etcd_(debugging|disk|request|server).*', action: 'drop', }, + { + sourceLabels: ['__name__'], + regex: 'apiserver_admission_controller_admission_latencies_seconds_.*', + action: 'drop', + }, + { + sourceLabels: ['__name__'], + regex: 'apiserver_admission_step_admission_latencies_seconds_.*', + action: 'drop', + }, ], }, ], From e3ae1cc350f46d7e54ea36c57d2032792e1bc586 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 6 Feb 2019 14:32:28 +0100 Subject: [PATCH 589/638] kube-prometheus: Update jsonnet deps --- jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index a3561854..6ceabcca 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "6963b7410f28575a90a65c0aee1c79c8ef392fbb" + "version": "986d387aaa6c292c248fc9d31c8b564462bd619e" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "668950e4af13f0153fa1d7b58ebe7023b33f2217" + "version": "ae5d0b27229765fc0670c48c09a95cb6da732de3" }, { "name": "grafonnet", From e4991fe7e5728ce9a7945f26f9b3e09f226b8e09 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 6 Feb 2019 14:37:10 +0100 Subject: [PATCH 590/638] kube-prometheus: Re-generate --- manifests/prometheus-rules.yaml | 16 ++++++++-------- .../prometheus-serviceMonitorApiserver.yaml | 8 ++++++++ manifests/prometheus-serviceMonitorKubelet.yaml | 10 ++++++++++ 3 
files changed, 26 insertions(+), 8 deletions(-) diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index aaf16ff5..a4da3102 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -241,25 +241,25 @@ spec: max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_avail:' - expr: | - sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) + + sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) record: :node_net_utilisation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) + + irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_net_utilisation:sum_irate - expr: | - sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) + + sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) record: :node_net_saturation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + 
(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) + + irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) diff --git a/manifests/prometheus-serviceMonitorApiserver.yaml b/manifests/prometheus-serviceMonitorApiserver.yaml index 6d884a2b..5dea38e4 100644 --- a/manifests/prometheus-serviceMonitorApiserver.yaml +++ b/manifests/prometheus-serviceMonitorApiserver.yaml @@ -14,6 +14,14 @@ spec: regex: etcd_(debugging|disk|request|server).* sourceLabels: - __name__ + - action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_step_admission_latencies_seconds_.* + sourceLabels: + - __name__ port: https scheme: https tlsConfig: diff --git a/manifests/prometheus-serviceMonitorKubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml index 97d7f1a1..590a5cd4 100644 --- a/manifests/prometheus-serviceMonitorKubelet.yaml +++ b/manifests/prometheus-serviceMonitorKubelet.yaml @@ -17,6 +17,16 @@ spec: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true interval: 30s + metricRelabelings: + - action: drop + regex: container_([a-z_]+); + sourceLabels: + - __name__ + - image + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ path: /metrics/cadvisor port: https-metrics scheme: https From fbc92dd9b31b4793f8a7d57acc4367aadb209820 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Thu, 14 Feb 2019 19:31:32 -0200 Subject: [PATCH 591/638] Update to support new version of addon-resizer --- README.md | 10 +++++----- .../kube-state-metrics/kube-state-metrics.libsonnet | 6 +++--- jsonnetfile.lock.json | 6 +++--- manifests/kube-state-metrics-deployment.yaml | 4 ++-- manifests/prometheus-rules.yaml | 6 +++--- 5 files changed, 16 
insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 914a63c2..1dec2c2b 100644 --- a/README.md +++ b/README.md @@ -264,7 +264,7 @@ These are the available fields with their respective default values: nodeExporter: "v0.17.0", kubeStateMetrics: "v1.5.0", kubeRbacProxy: "v0.4.1", - addonResizer: "1.0", + addonResizer: "2.1", prometheusOperator: "v0.28.0", prometheus: "v2.5.0", }, @@ -274,7 +274,7 @@ These are the available fields with their respective default values: alertmanager: "quay.io/prometheus/alertmanager", kubeStateMetrics: "quay.io/coreos/kube-state-metrics", kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", - addonResizer: "quay.io/coreos/addon-resizer", + addonResizer: "gcr.io/google-containers/addon-resizer-amd64", nodeExporter: "quay.io/prometheus/node-exporter", prometheusOperator: "quay.io/coreos/prometheus-operator", }, @@ -402,9 +402,9 @@ To produce the `docker pull/tag/push` commands that will synchronize upstream im ```shell $ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization sync-to-internal-registry.jsonnet -docker pull quay.io/coreos/addon-resizer:1.0 -docker tag quay.io/coreos/addon-resizer:1.0 internal-registry.com/organization/addon-resizer:1.0 -docker push internal-registry.com/organization/addon-resizer:1.0 +docker pull gcr.io/google-containers/addon-resizer-amd64:2.1 +docker tag gcr.io/google-containers/addon-resizer-amd64:2.1 internal-registry.com/organization/addon-resizer:2.1 +docker push internal-registry.com/organization/addon-resizer:2.1 docker pull quay.io/prometheus/alertmanager:v0.15.3 docker tag quay.io/prometheus/alertmanager:v0.15.3 internal-registry.com/organization/alertmanager:v0.15.3 docker push internal-registry.com/organization/alertmanager:v0.15.3 diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index a0dddff2..0f59af4d 100644 --- 
a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -18,13 +18,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { kubeStateMetrics: 'v1.5.0', kubeRbacProxy: 'v0.4.1', - addonResizer: '1.0', + addonResizer: '2.1', }, imageRepos+:: { kubeStateMetrics: 'quay.io/coreos/kube-state-metrics', kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy', - addonResizer: 'quay.io/coreos/addon-resizer', + addonResizer: 'gcr.io/google-containers/addon-resizer-amd64', }, }, @@ -175,7 +175,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode, '--memory=' + $._config.kubeStateMetrics.baseMemory, '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode, - '--threshold=5', + '--acceptance-offset=5', '--deployment=kube-state-metrics', ]) + container.withEnv([ diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 6ceabcca..f7546213 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "986d387aaa6c292c248fc9d31c8b564462bd619e" + "version": "7c4b660febf68b4b0930f761c7c0a992330d3935" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "ae5d0b27229765fc0670c48c09a95cb6da732de3" + "version": "a16ec1d098eb45e7cfb44d7e4b73ba7b766f0e0a" }, { "name": "grafonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "1fe6f109c87c4fa47775426a6a60c3b954ed5c33" + "version": "3546c4868cec93e1587471b42fd815684a7dd439" } ] } diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index 94f7b36a..d6d15672 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -71,7 +71,7 @@ spec: - --extra-cpu=2m - --memory=150Mi - --extra-memory=30Mi - - --threshold=5 + - 
--acceptance-offset=5 - --deployment=kube-state-metrics env: - name: MY_POD_NAME @@ -84,7 +84,7 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - image: quay.io/coreos/addon-resizer:1.0 + image: gcr.io/google-containers/addon-resizer-amd64:2.1 name: addon-resizer resources: limits: diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index a4da3102..e0696ade 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -691,11 +691,11 @@ spec: severity: warning - alert: KubeVersionMismatch annotations: - message: There are {{ $value }} different versions of Kubernetes components - running. + message: There are {{ $value }} different semantic versions of Kubernetes + components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch expr: | - count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 + count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 for: 1h labels: severity: warning From 1e7a6331478f0cf2b9ed380ff88d99028a4b4a03 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Thu, 14 Feb 2019 19:43:49 -0200 Subject: [PATCH 592/638] After update libs and regenerate --- jsonnetfile.lock.json | 8 ++--- manifests/grafana-dashboardDefinitions.yaml | 40 ++++++++++----------- manifests/prometheus-rules.yaml | 14 +++++++- 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index f7546213..88a61ac8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "7c4b660febf68b4b0930f761c7c0a992330d3935" + "version": "94b30526f43d589912a38193b69dce19b4fa1893" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "a16ec1d098eb45e7cfb44d7e4b73ba7b766f0e0a" + 
"version": "5525c8cc8a4a52d272bdaf481dd77b53a0c0f051" }, { "name": "grafonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "24590e90dcfc9d581c4208b4bb3d15df50f1328e" + "version": "403b7d0120d2903d21854eae217b4e4863c454d1" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "3546c4868cec93e1587471b42fd815684a7dd439" + "version": "4cd0bf8ea846a0d158761d55899f631eb2a423cf" } ] } diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 92fdc805..c4ff59b0 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -60,7 +60,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))", + "expr": "node:cluster_cpu_utilisation:ratio", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -244,7 +244,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_memory_utilisation:ratio", + "expr": "node:cluster_memory_utilisation:ratio", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -856,7 +856,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -908,7 +908,7 @@ items: ] }, "timezone": "", - "title": "K8s / USE Method / Cluster", + "title": "Kubernetes / USE Method / Cluster", "uid": "a6e7d1362e1ddbb79db21d5bb40d7137", "version": 0 } @@ -1772,7 +1772,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -1851,7 +1851,7 @@ items: ] }, "timezone": "", - "title": "K8s / USE Method / Node", + "title": "Kubernetes / USE Method / Node", "uid": "4ac4f123aae0ff6dbaf4f4f66120033b", "version": 0 } @@ -3127,7 +3127,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -3179,7 +3179,7 @@ 
items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Cluster", + "title": "Kubernetes / Compute Resources / Cluster", "uid": "efa86fd1d0c121a26444b636a3f509a8", "version": 0 } @@ -3939,7 +3939,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -4018,7 +4018,7 @@ items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Namespace", + "title": "Kubernetes / Compute Resources / Namespace", "uid": "85a562078cdf77779eaa1add43ccec1e", "version": 0 } @@ -4778,7 +4778,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -4884,7 +4884,7 @@ items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Pod", + "title": "Kubernetes / Compute Resources / Pod", "uid": "6581e46e4e5c7ba40a07646395ef7b23", "version": 0 } @@ -6124,7 +6124,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -6202,7 +6202,7 @@ items: ] }, "timezone": "", - "title": "Nodes", + "title": "Kubernetes / Nodes", "uid": "fa49a4706d07a042595b664c87fb33ea", "version": 0 } @@ -6447,7 +6447,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -6551,7 +6551,7 @@ items: ] }, "timezone": "", - "title": "Persistent Volumes", + "title": "Kubernetes / Persistent Volumes", "uid": "919b92a8e8041bd567af9edab12c840c", "version": 0 } @@ -6911,7 +6911,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -7041,7 +7041,7 @@ items: ] }, "timezone": "", - "title": "Pods", + "title": "Kubernetes / Pods", "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", "version": 0 } @@ -7800,7 +7800,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -7904,7 +7904,7 @@ items: ] }, "timezone": "", - "title": "StatefulSets", + "title": "Kubernetes / 
StatefulSets", "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 } diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index e0696ade..19432b5f 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -136,6 +136,13 @@ spec: * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) record: node:node_cpu_utilisation:avg1m + - expr: | + node:node_cpu_utilisation:avg1m + * + node:node_num_cpu:sum + / + scalar(sum(node:node_num_cpu:sum)) + record: node:cluster_cpu_utilisation:ratio - expr: | sum(node_load1{job="node-exporter"}) / @@ -179,8 +186,13 @@ spec: - expr: | (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / - scalar(sum(node:node_memory_bytes_total:sum)) + node:node_memory_bytes_total:sum record: node:node_memory_utilisation:ratio + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:cluster_memory_utilisation:ratio - expr: | 1e3 * sum( (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) From d7f4ad2a596bb74d3c809a97da6096d13f724343 Mon Sep 17 00:00:00 2001 From: "GIBSON, NICHOLAS R" Date: Fri, 15 Feb 2019 10:42:33 -0800 Subject: [PATCH 593/638] generated documentation for new thanos-sidecar flags --- manifests/grafana-dashboardDefinitions.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index c4ff59b0..140e6b07 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ 
scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -5781,7 +5781,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5872,7 +5872,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6631,21 +6631,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { 
- "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -6971,7 +6971,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -7138,7 +7138,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7218,7 +7218,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7298,7 +7298,7 @@ items: "tableColumn": "", 
"targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", From 4a2b7cbd0de9168aa50a6eeb5514fde5636fcadf Mon Sep 17 00:00:00 2001 From: "GIBSON, NICHOLAS R" Date: Mon, 18 Feb 2019 13:05:39 -0800 Subject: [PATCH 594/638] fixing doc generation --- manifests/grafana-dashboardDefinitions.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 140e6b07..c4ff59b0 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -5781,7 +5781,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5872,7 +5872,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6631,21 +6631,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", 
namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -6971,7 +6971,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -7138,7 +7138,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7218,7 +7218,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7298,7 +7298,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=~\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "", From d9bb0c2ec4d0e91da73138d80971c5e8d1e602b3 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 20 Feb 2019 12:46:39 +0100 Subject: [PATCH 595/638] Update Prometheus Operator in kube-prometheus to v0.29.0 --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- jsonnetfile.lock.json | 2 +- ...0alertmanagerCustomResourceDefinition.yaml | 17 +++++------ ...r-0prometheusCustomResourceDefinition.yaml | 30 +++++++++++++------ ...rometheusruleCustomResourceDefinition.yaml | 5 ++-- ...ervicemonitorCustomResourceDefinition.yaml | 2 +- .../0prometheus-operator-deployment.yaml | 4 +-- 7 files changed, 37 insertions(+), 25 deletions(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 7be8827a..5f481c36 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -38,7 +38,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "v0.28.0" + "version": "v0.29.0" }, { "name": "etcd-mixin", diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 88a61ac8..bd343967 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "338addbabc8a29b46840df0bb0355c12b96a6f21" + "version": "7a25bf6b6bb2347dacb235659b73bc210117acc7" }, { "name": "etcd-mixin", diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 89748f1a..821e313d 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -1315,8 +1315,7 @@ spec: type: boolean volumeDevices: description: volumeDevices is the list of block devices to be - used by the container. This is an alpha feature and may change - in the future. + used by the container. This is a beta feature. 
items: description: volumeDevice describes a mapping of a raw block device within a container. @@ -1647,8 +1646,9 @@ spec: set to true. There cannot be more than one managing controller. items: description: OwnerReference contains enough information to let - you identify an owning object. Currently, an owning object must - be in the same namespace, so there is no namespace field. + you identify an owning object. An owning object must be in the + same namespace as the dependent, or be cluster-scoped, so there + is no namespace field. properties: apiVersion: description: API version of the referent. @@ -2116,9 +2116,9 @@ spec: There cannot be more than one managing controller. items: description: OwnerReference contains enough information - to let you identify an owning object. Currently, an - owning object must be in the same namespace, so there - is no namespace field. + to let you identify an owning object. An owning object + must be in the same namespace as the dependent, or be + cluster-scoped, so there is no namespace field. properties: apiVersion: description: API version of the referent. @@ -2263,8 +2263,7 @@ spec: volumeMode: description: volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not - included in claim spec. This is an alpha feature and may - change in the future. + included in claim spec. This is a beta feature. type: string volumeName: description: VolumeName is the binding reference to the diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 158c5cb3..a69cf0a9 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1480,8 +1480,7 @@ spec: type: boolean volumeDevices: description: volumeDevices is the list of block devices to be - used by the container. 
This is an alpha feature and may change - in the future. + used by the container. This is a beta feature. items: description: volumeDevice describes a mapping of a raw block device within a container. @@ -1829,8 +1828,9 @@ spec: set to true. There cannot be more than one managing controller. items: description: OwnerReference contains enough information to let - you identify an owning object. Currently, an owning object must - be in the same namespace, so there is no namespace field. + you identify an owning object. An owning object must be in the + same namespace as the dependent, or be cluster-scoped, so there + is no namespace field. properties: apiVersion: description: API version of the referent. @@ -2146,6 +2146,10 @@ spec: required: - url type: array + replicaExternalLabelName: + description: Name of Prometheus external label used to denote replica + name. Defaults to the value of `prometheus_replica`. + type: string replicas: description: Number of instances to deploy for a Prometheus deployment. format: int32 @@ -2752,9 +2756,9 @@ spec: There cannot be more than one managing controller. items: description: OwnerReference contains enough information - to let you identify an owning object. Currently, an - owning object must be in the same namespace, so there - is no namespace field. + to let you identify an owning object. An owning object + must be in the same namespace as the dependent, or be + cluster-scoped, so there is no namespace field. properties: apiVersion: description: API version of the referent. @@ -2899,8 +2903,7 @@ spec: volumeMode: description: volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not - included in claim spec. This is an alpha feature and may - change in the future. + included in claim spec. This is a beta feature. 
type: string volumeName: description: VolumeName is the binding reference to the @@ -2975,6 +2978,10 @@ spec: baseImage: description: Thanos base image if other than default. type: string + clusterAdvertiseAddress: + description: Explicit (external) ip:port address to advertise for + gossip in gossip cluster. Used internally for membership only. + type: string gcs: description: 'Deprecated: ThanosGCSSpec should be configured with an ObjectStorageConfig secret starting with Thanos v0.2.0. ThanosGCSSpec @@ -3000,6 +3007,11 @@ spec: type: boolean required: - key + grpcAdvertiseAddress: + description: Explicit (external) host:port address to advertise + for gRPC StoreAPI in gossip cluster. If empty, 'grpc-address' + will be used. + type: string image: description: Image if specified has precedence over baseImage, tag and sha combinations. Specifying the version is still necessary diff --git a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index 877fadac..76a78998 100644 --- a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -253,8 +253,9 @@ spec: There cannot be more than one managing controller. items: description: OwnerReference contains enough information to let you - identify an owning object. Currently, an owning object must be in - the same namespace, so there is no namespace field. + identify an owning object. An owning object must be in the same + namespace as the dependent, or be cluster-scoped, so there is no + namespace field. properties: apiVersion: description: API version of the referent. 
diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index d2e310fd..431bde39 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -138,7 +138,7 @@ spec: type: string relabelings: description: 'RelabelConfigs to apply to samples before ingestion. - More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#' + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 1f880582..f1ace87e 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -20,8 +20,8 @@ spec: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.28.0 - image: quay.io/coreos/prometheus-operator:v0.28.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.29.0 + image: quay.io/coreos/prometheus-operator:v0.29.0 name: prometheus-operator ports: - containerPort: 8080 From 3536ec550302356b71c241de8d74b1fdcc8b50f4 Mon Sep 17 00:00:00 2001 From: Michael Goodness Date: Wed, 20 Feb 2019 02:36:55 -0600 Subject: [PATCH 596/638] Upgrade prometheus-adapter APIService to v1 Also remove unnecessary namespace from prometheus-adapter ClusterRoleBinding Signed-off-by: Michael Goodness --- .../prometheus-adapter/prometheus-adapter.libsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet index 8624c94a..079f9fb7 100644 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -50,7 +50,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; prometheusAdapter+:: { apiService: { - apiVersion: 'apiregistration.k8s.io/v1beta1', + apiVersion: 'apiregistration.k8s.io/v1', kind: 'APIService', metadata: { name: 'v1beta1.metrics.k8s.io', @@ -147,7 +147,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; clusterRoleBinding.new() + clusterRoleBinding.mixin.metadata.withName($._config.prometheusAdapter.name) + - clusterRoleBinding.mixin.metadata.withNamespace($._config.namespace) + clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + clusterRoleBinding.mixin.roleRef.withName($.prometheusAdapter.clusterRole.metadata.name) + clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + From 0cfb5d8a9a8130b68958a63afc86f253e76853af Mon Sep 17 00:00:00 2001 From: Michael Goodness Date: Thu, 21 Feb 2019 11:39:20 -0600 Subject: [PATCH 597/638] Add generated artifacts Signed-off-by: Michael Goodness --- jsonnetfile.lock.json | 8 +- manifests/grafana-dashboardDefinitions.yaml | 208 ++++++++++++++++-- manifests/prometheus-adapter-apiService.yaml | 2 +- ...prometheus-adapter-clusterRoleBinding.yaml | 1 - 4 files changed, 198 insertions(+), 21 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 88a61ac8..1150a138 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "94b30526f43d589912a38193b69dce19b4fa1893" + "version": "a4cd74c5906273ea2c38a2b728641cdef017c76c" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": 
"5525c8cc8a4a52d272bdaf481dd77b53a0c0f051" + "version": "ccb787a44f2ebdecbb346d57490fa7e49981b323" }, { "name": "grafonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "403b7d0120d2903d21854eae217b4e4863c454d1" + "version": "bbf03a7971ebac7011ef9320fcc23cc01e0a54d3" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "4cd0bf8ea846a0d158761d55899f631eb2a423cf" + "version": "8c228d692bfa516e1c0977922f061b9a0fb1ae0f" } ] } diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index c4ff59b0..d8bf4633 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2831,7 +2831,7 @@ items: }, "yaxes": [ { - "format": "decbytes", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -2919,7 +2919,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Requests", @@ -2937,7 +2937,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Requests %", @@ -2973,7 +2973,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Limits %", @@ -3625,7 +3625,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Usage (w/o cache)", "tooltip": { "shared": true, "sort": 0, @@ -3643,7 +3643,7 @@ items: }, "yaxes": [ { - "format": "decbytes", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -3731,7 +3731,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Requests", @@ -3749,7 +3749,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Requests %", @@ -3785,7 +3785,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Limits %", @@ -3805,6 +3805,60 @@ items: "type": "number", "unit": "percentunit" }, + { + "alias": "Memory 
Usage (RSS)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Usage (Cache)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #G", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Usage (Swap", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #H", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, { "alias": "Pod", "colorMode": null, @@ -3884,6 +3938,33 @@ items: "legendFormat": "", "refId": "E", "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_rss{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_cache{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "G", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_swap{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "H", + "step": 10 } ], "thresholds": [ @@ -4451,10 +4532,26 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", 
container_name!=\"\"}) by (container_name)", + "expr": "sum(container_memory_rss{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container_name}}", + "legendFormat": "{{container_name}} (RSS)", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(container_memory_cache{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container_name}} (Cache)", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(container_memory_swap{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container_name}} (Swap)", "legendLink": null, "step": 10 } @@ -4482,7 +4579,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -4570,7 +4667,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Requests", @@ -4588,7 +4685,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Requests %", @@ -4624,7 +4721,7 @@ items: ], "type": "number", - "unit": "decbytes" + "unit": "bytes" }, { "alias": "Memory Limits %", @@ -4644,6 +4741,60 @@ items: "type": "number", "unit": "percentunit" }, + { + "alias": "Memory Usage (RSS)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Usage (Cache)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + 
"linkUrl": "", + "pattern": "Value #G", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Usage (Swap", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #H", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, { "alias": "Container", "colorMode": null, @@ -4723,6 +4874,33 @@ items: "legendFormat": "", "refId": "E", "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_rss{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_cache{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "G", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_swap{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "H", + "step": 10 } ], "thresholds": [ diff --git a/manifests/prometheus-adapter-apiService.yaml b/manifests/prometheus-adapter-apiService.yaml index 95d5c32d..a215efe4 100644 --- a/manifests/prometheus-adapter-apiService.yaml +++ b/manifests/prometheus-adapter-apiService.yaml @@ -1,4 +1,4 @@ -apiVersion: apiregistration.k8s.io/v1beta1 +apiVersion: apiregistration.k8s.io/v1 kind: APIService metadata: name: v1beta1.metrics.k8s.io diff --git 
a/manifests/prometheus-adapter-clusterRoleBinding.yaml b/manifests/prometheus-adapter-clusterRoleBinding.yaml index 29fa9176..7e8f3da9 100644 --- a/manifests/prometheus-adapter-clusterRoleBinding.yaml +++ b/manifests/prometheus-adapter-clusterRoleBinding.yaml @@ -2,7 +2,6 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: prometheus-adapter - namespace: monitoring roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole From f0438e7f80d1d19d2953eff382f268db01cc3eef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 22 Feb 2019 18:39:19 +0100 Subject: [PATCH 598/638] kube-prometheus: node-exporter tolerates any node --- .../node-exporter/node-exporter.libsonnet | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 93c90a1f..6b7f7f8a 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -66,9 +66,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local podLabels = { app: 'node-exporter' }; - local masterToleration = toleration.new() + - toleration.withEffect('NoSchedule') + - toleration.withKey('node-role.kubernetes.io/master'); + local noExecuteToleration = toleration.new() + + toleration.withOperator('Exists') + + toleration.withEffect('NoExecute'); + + local noScheduleToleration = toleration.new() + + toleration.withOperator('Exists') + + toleration.withEffect('NoSchedule'); local procVolumeName = 'proc'; local procVolume = volume.fromHostPath(procVolumeName, '/proc'); @@ -132,7 +136,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; daemonset.mixin.metadata.withLabels(podLabels) + daemonset.mixin.spec.selector.withMatchLabels(podLabels) + daemonset.mixin.spec.template.metadata.withLabels(podLabels) + - 
daemonset.mixin.spec.template.spec.withTolerations([masterToleration]) + + daemonset.mixin.spec.template.spec.withTolerations([noExecuteToleration, noScheduleToleration]) + daemonset.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + daemonset.mixin.spec.template.spec.withContainers(c) + daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume, rootVolume]) + From 872fd4558e781548d82701b1f6cf5f0bfe0b38a4 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 25 Feb 2019 10:01:05 +0100 Subject: [PATCH 599/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 6 +++--- manifests/node-exporter-daemonset.yaml | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index b487f059..57b903f7 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "a4cd74c5906273ea2c38a2b728641cdef017c76c" + "version": "8191e30cccc28c54b2386aab1b685396bf1ed4ba" }, { "name": "ksonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "bbf03a7971ebac7011ef9320fcc23cc01e0a54d3" + "version": "5d7e5391010c768a6ddd39163c35662f379e20ca" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "8c228d692bfa516e1c0977922f061b9a0fb1ae0f" + "version": "5effa154b464faa6a9ca88296df831eb7f0b8955" } ] } diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index e8ea15f1..f0729b5d 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -74,8 +74,10 @@ spec: runAsUser: 65534 serviceAccountName: node-exporter tolerations: + - effect: NoExecute + operator: Exists - effect: NoSchedule - key: node-role.kubernetes.io/master + operator: Exists volumes: - hostPath: path: /proc From 03d36217d04e396f643e42b76d7ce10c4d488acb Mon Sep 17 00:00:00 2001 From: Frederic 
Branczyk Date: Mon, 25 Feb 2019 14:21:44 +0100 Subject: [PATCH 600/638] kube-prometheus: Use node-exporter metrics for resource metrics API --- .../prometheus-adapter/prometheus-adapter.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet index 079f9fb7..193a36a5 100644 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -20,7 +20,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; resourceRules: cpu: containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) - nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[1m])) by (<<.GroupBy>>) + nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) resources: overrides: node: @@ -32,7 +32,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; containerLabel: container_name memory: containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) - nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>) + nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>) resources: overrides: node: From be2557814d6917f7f4ce6cab7a52a16ab759a03b Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 26 Feb 2019 17:45:04 +0100 Subject: [PATCH 601/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 4 ++-- manifests/prometheus-adapter-configMap.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 57b903f7..5d10866c 
100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "8191e30cccc28c54b2386aab1b685396bf1ed4ba" + "version": "df002d09f7b7a50321786c4f19c70d371494410b" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "5effa154b464faa6a9ca88296df831eb7f0b8955" + "version": "a7e3bd06b2ef0286e1571836997287a81146c25a" } ] } diff --git a/manifests/prometheus-adapter-configMap.yaml b/manifests/prometheus-adapter-configMap.yaml index a231de36..d6ebd78f 100644 --- a/manifests/prometheus-adapter-configMap.yaml +++ b/manifests/prometheus-adapter-configMap.yaml @@ -4,7 +4,7 @@ data: resourceRules: cpu: containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) - nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[1m])) by (<<.GroupBy>>) + nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) resources: overrides: node: @@ -16,7 +16,7 @@ data: containerLabel: container_name memory: containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) - nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>) + nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>) resources: overrides: node: From 71976ec1f2bb93b82f48ab31b7b1a82c4a70b0a6 Mon Sep 17 00:00:00 2001 From: Joy Bhattacherjee Date: Thu, 28 Feb 2019 15:01:06 +0530 Subject: [PATCH 602/638] Makes thanos downsampled data browsable from grafana using query.auto-downsampling --- jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet 
b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index cc6ec460..c99bf38b 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -83,6 +83,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; 'query', '--log.level=debug', '--query.replica-label=prometheus_replica', + '--query.auto-downsampling', '--cluster.peers=thanos-peers.' + $._config.namespace + '.svc:10900', ]); local podLabels = { app: 'thanos-query', 'thanos-peers': 'true' }; From 7e594084f21214307a7ed78dcb1a873ae4eec414 Mon Sep 17 00:00:00 2001 From: Akash Nair Date: Fri, 1 Mar 2019 16:04:02 +0100 Subject: [PATCH 603/638] Changed kubectl apply -f to kubectl create -f. kubectl apply command leads to conflicts and race conditions. kubectl create works the first time you run it --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1dec2c2b..b612e0c4 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ This project is intended to be used as a library (i.e. the intent is not for you Though for a quickstart a compiled version of the Kubernetes [manifests](manifests) generated with this library (specifically with `example.jsonnet`) is checked into this repository in order to try the content out quickly. To try out the stack un-customized run: * Simply create the stack: ``` -$ kubectl apply -f manifests/ +$ kubectl create -f manifests/ # It can take a few seconds for the above 'create manifests' command to fully create the following resources, so verify the resources are ready before proceeding. 
$ until kubectl get customresourcedefinitions servicemonitors.monitoring.coreos.com ; do date; sleep 1; echo ""; done From 67b2c29875c5ecc8d15020266f8933f69bea6bcd Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 1 Mar 2019 18:13:11 +0100 Subject: [PATCH 604/638] kube-prometheus/docs: Update how to handle existing rules --- ...prometheus-rules-and-grafana-dashboards.md | 23 +++++++++++-------- examples/existingrule.json | 1 + .../{example.rules.yaml => existingrule.yaml} | 0 ...s-additional-rendered-rule-example.jsonnet | 9 +------- 4 files changed, 16 insertions(+), 17 deletions(-) create mode 100644 examples/existingrule.json rename examples/{example.rules.yaml => existingrule.yaml} (100%) diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index a0c1ff76..dcd0ad72 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -111,19 +111,24 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { ### Pre-rendered rules -We acknowledge, that users may need to transition existing rules, and therefore allow an option to add additional pre-rendered rules. This can be done simply by importing the existing rules in the [Prometheus rule format](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) using the jsonnet function `importstr`. In this example we are importing a [provided example rule](../examples/example.rules.yaml). +We acknowledge, that users may need to transition existing rules, and therefore allow an option to add additional pre-rendered rules. Luckily the yaml and json formats are very close so the yaml rules just need to be converted to json without any manual interaction needed. 
Just a tool to convert yaml to json is needed: + +``` +go get -u -v github.com/brancz/gojsontoyaml +``` + +And convert the existing rule file: + +``` +cat existingrule.yaml | gojsontoyaml -yamltojson > existingrule.json +``` + +Then import it in jsonnet: [embedmd]:# (../examples/prometheus-additional-rendered-rule-example.jsonnet) ```jsonnet local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - prometheus+:: { - renderedRules: { - 'example.rules.yaml': (importstr 'example.rules.yaml'), - }, - }, - }, + prometheusAlerts+:: (import 'existingrule.json'), }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + diff --git a/examples/existingrule.json b/examples/existingrule.json new file mode 100644 index 00000000..b29a5c45 --- /dev/null +++ b/examples/existingrule.json @@ -0,0 +1 @@ +{"groups":[{"name":"example-group","rules":[{"alert":"DeadMansSwitch","annotations":{"description":"This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]} \ No newline at end of file diff --git a/examples/example.rules.yaml b/examples/existingrule.yaml similarity index 100% rename from examples/example.rules.yaml rename to examples/existingrule.yaml diff --git a/examples/prometheus-additional-rendered-rule-example.jsonnet b/examples/prometheus-additional-rendered-rule-example.jsonnet index 4ee7317d..07ef0e50 100644 --- a/examples/prometheus-additional-rendered-rule-example.jsonnet +++ b/examples/prometheus-additional-rendered-rule-example.jsonnet @@ -1,12 +1,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - prometheus+:: { - renderedRules: { - 'example.rules.yaml': (importstr 'example.rules.yaml'), - }, - }, - }, + prometheusAlerts+:: (import 'existingrule.json'), }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in 
std.objectFields(kp.kubePrometheus) } + From ea5c455c467b6b023d0d7549f46f3afe32dcf083 Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Thu, 7 Mar 2019 08:24:22 +1300 Subject: [PATCH 605/638] contrib/kube-prometheus: Jsonnet snippet for managed kubernetes clusters --- .../kube-prometheus-managed-cluster.jsonnet | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 jsonnet/kube-prometheus/kube-prometheus-managed-cluster.jsonnet diff --git a/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.jsonnet b/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.jsonnet new file mode 100644 index 00000000..442c0261 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.jsonnet @@ -0,0 +1,28 @@ +// On managed Kubernetes clusters some of the control plane components are not exposed to customers. +// Disable scrape jobs and service monitors for these components by overwriting 'kube-prometheus.libsonnet' defaults +// Note this doesn't disable generation of associated alerting rules but the rules don't trigger + +{ + _config+:: { + // This snippet walks the original object (super.jobs, set as temp var j) and creates a replacement jobs object + // excluding any members of the set specified (eg: controller and scheduler). + local j = super.jobs, + jobs: { + [k]: j[k] + for k in std.objectFields(j) + if !std.setMember(k, ['KubeControllerManager', 'KubeScheduler']) + }, + }, + + // Same as above but for ServiceMonitor's + local p = super.prometheus, + prometheus: { + [q]: p[q] + for q in std.objectFields(p) + if !std.setMember(q, ['serviceMonitorKubeControllerManager', 'serviceMonitorKubeScheduler']) + }, + + // TODO: disable generationg of alerting rules + // manifests/prometheus-rules.yaml:52: - name: kube-scheduler.rules + +} From eb3ba15cffe7136f1668bca9532e54be3fc336df Mon Sep 17 00:00:00 2001 From: "Jessie A. 
Morris" Date: Thu, 7 Mar 2019 15:57:45 -0700 Subject: [PATCH 606/638] Update jssonnet files --- manifests/grafana-dashboardDefinitions.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index d8bf4633..20f66dd4 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -5959,7 +5959,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6050,7 +6050,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": 
"max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6809,21 +6809,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -7149,7 +7149,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -7316,7 +7316,7 @@ items: "tableColumn": "", "targets": [ { - "expr": 
"sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7396,7 +7396,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7476,7 +7476,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", From ab8e168a12f3634abd663afcb10acebb0915cee6 Mon Sep 17 00:00:00 2001 From: "Jessie A. 
Morris" Date: Thu, 7 Mar 2019 16:33:19 -0700 Subject: [PATCH 607/638] Actually commit the right jsonnet changes --- manifests/grafana-dashboardDefinitions.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 20f66dd4..d8bf4633 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -5959,7 +5959,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6050,7 +6050,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", + "expr": 
"max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6809,21 +6809,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -7149,7 +7149,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -7316,7 +7316,7 @@ items: "tableColumn": "", "targets": [ { - "expr": 
"sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7396,7 +7396,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7476,7 +7476,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=~\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", From ea24858666513a4460ad54ce17fdfca6c528c54f Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Thu, 7 Mar 2019 18:44:02 -0600 Subject: [PATCH 608/638] contrib/kube-prometheus: fix GKE cAdvisor readme link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b612e0c4..9a78a2ef 100644 --- a/README.md +++ b/README.md @@ -581,7 +581,7 @@ Should the Prometheus `/targets` page show kubelet targets, but not able to succ As described in the [Prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. 
Some Kubernetes setup tools do not enable this by default. -If you are using Google's GKE product, see [docs/GKE-cadvisor-support.md]. +If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md). #### Authentication problem From 78bec4e48b6ff09c1127931ad7930d9e5ed6ee94 Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Sun, 10 Mar 2019 14:50:09 -0700 Subject: [PATCH 609/638] Fix PrometheusTSDBWALCorruptions alert * `tsdb_wal_corruptions_total metric` name should be `prometheus_tsdb_wal_corruptions_total` --- jsonnet/kube-prometheus/alerts/prometheus.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet index b188faa2..a2d0cc67 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet @@ -109,7 +109,7 @@ summary: 'Prometheus write-ahead log is corrupted', }, expr: ||| - tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 + prometheus_tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 ||| % $._config, 'for': '4h', labels: { From 8f182a818e9d8388ba855ed0ab88f113f1ba8731 Mon Sep 17 00:00:00 2001 From: Wesley Cranston Date: Mon, 11 Mar 2019 22:56:09 -0400 Subject: [PATCH 610/638] Update developing-prometheus-rules-and-grafana-dashboards.md From the discussion brought up in this [issue](https://github.com/coreos/prometheus-operator/issues/2250), a new section has been added to the doc. 
--- ...prometheus-rules-and-grafana-dashboards.md | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index dcd0ad72..264e4f04 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -139,7 +139,75 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` +### Changing default rules +Along with adding additional rules, we give the user the option to filter or adjust the existing rules imported by `kube-prometheus/kube-prometheus.libsonnet`. The recording rules can be found in [kube-prometheus/rules](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus/jsonnet/kube-prometheus/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules) while the alerting rules can be found in [kube-prometheus/alerts](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus/jsonnet/kube-prometheus/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts). + +Knowing which rules to change, the user can now use functions from the [Jsonnet standard library](https://jsonnet.org/ref/stdlib.html) to make these changes. Below are examples of both a filter and an adjustment being made to the default rules. These changes can be assigned to a local variable and then added to the `local kp` object as seen in the examples above. + +#### Filter +Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. 
The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). +```jsonnet +local filter = { + prometheusAlerts+:: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.filter(function(rule) + rule.alert != "KubeStatefulSetReplicasMismatch", + group.rules + ) + } + else + group, + super.groups + ), + }, +}; +``` +#### Adjustment +Here the expression for the alert used above is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). +```jsonnet +local update = { + prometheusAlerts+:: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.map( + function(rule) + if rule.alert == "KubeStatefulSetReplicasMismatch" then + rule { + expr: "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",statefulset!=\"vault\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",statefulset!=\"vault\"}" + } + else + rule, + group.rules + ) + } + else + group, + super.groups + ), + }, +}; +``` +Using the example from above about adding in pre-rendered rules, the new local vaiables can be added in as follows: +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + filter + update + { + prometheusAlerts+:: (import 'existingrule.json'), +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in 
std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` ## Dashboards Dashboards can either be added using jsonnet or simply a pre-rendered json dashboard. From 3fbc968930c2fef025470c59060d25635070f8e7 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 11 Mar 2019 13:26:36 +0100 Subject: [PATCH 611/638] contrib/kube-prometheus: Update Prometheus, Thanos & Grafana --- jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet | 2 +- jsonnet/kube-prometheus/kube-prometheus.libsonnet | 2 +- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index c99bf38b..4b20b814 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -5,7 +5,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; { _config+:: { versions+:: { - thanos: 'v0.2.1', + thanos: 'v0.3.2', }, imageRepos+:: { thanos: 'improbable/thanos', diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index c30f13f9..f51ae49a 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -43,7 +43,7 @@ local configMapList = k.core.v1.configMapList; namespace: 'default', versions+:: { - grafana: '6.0.0-beta1', + grafana: '6.0.1', }, tlsCipherSuites: [ diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 453ab79c..ba2adb05 100644 --- 
a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheus: 'v2.5.0', + prometheus: 'v2.7.2', }, imageRepos+:: { From 7750915d2886c589f79a63ed76cf2f0be7b4665b Mon Sep 17 00:00:00 2001 From: deepakhj Date: Tue, 12 Mar 2019 20:09:06 -0700 Subject: [PATCH 612/638] Update kube-dns-prometheus-discovery metrics port label to match ServiceMonitor. --- jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet index a9cf3bb3..556fa856 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet @@ -15,7 +15,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + service.mixin.spec.withClusterIp('None'), kubeDnsPrometheusDiscoveryService: - service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('http-metrics-skydns', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) + + service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('metrics', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) + service.mixin.metadata.withNamespace('kube-system') + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) + service.mixin.spec.withClusterIp('None'), From 5ed3fb06c9a7674ed3c4d6f41740ca376918eb9c Mon Sep 17 00:00:00 2001 From: goll Date: Wed, 13 Mar 2019 20:21:36 +0100 Subject: [PATCH 613/638] Use suggested addon-resizer 1.8.4 --- README.md | 20 +++++++++---------- .../alertmanager/alertmanager.libsonnet | 2 +- .../kube-prometheus/alerts/general.libsonnet | 4 ++-- 
.../kube-state-metrics.libsonnet | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 9a78a2ef..79e7f929 100644 --- a/README.md +++ b/README.md @@ -260,12 +260,12 @@ These are the available fields with their respective default values: namespace: "default", versions+:: { - alertmanager: "v0.16.0", + alertmanager: "v0.16.1", nodeExporter: "v0.17.0", kubeStateMetrics: "v1.5.0", kubeRbacProxy: "v0.4.1", - addonResizer: "2.1", - prometheusOperator: "v0.28.0", + addonResizer: "1.8.4", + prometheusOperator: "v0.29.0", prometheus: "v2.5.0", }, @@ -274,7 +274,7 @@ These are the available fields with their respective default values: alertmanager: "quay.io/prometheus/alertmanager", kubeStateMetrics: "quay.io/coreos/kube-state-metrics", kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", - addonResizer: "gcr.io/google-containers/addon-resizer-amd64", + addonResizer: "k8s.gcr.io/addon-resizer", nodeExporter: "quay.io/prometheus/node-exporter", prometheusOperator: "quay.io/coreos/prometheus-operator", }, @@ -402,12 +402,12 @@ To produce the `docker pull/tag/push` commands that will synchronize upstream im ```shell $ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization sync-to-internal-registry.jsonnet -docker pull gcr.io/google-containers/addon-resizer-amd64:2.1 -docker tag gcr.io/google-containers/addon-resizer-amd64:2.1 internal-registry.com/organization/addon-resizer:2.1 -docker push internal-registry.com/organization/addon-resizer:2.1 -docker pull quay.io/prometheus/alertmanager:v0.15.3 -docker tag quay.io/prometheus/alertmanager:v0.15.3 internal-registry.com/organization/alertmanager:v0.15.3 -docker push internal-registry.com/organization/alertmanager:v0.15.3 +docker pull k8s.gcr.io/addon-resizer:1.8.4 +docker tag k8s.gcr.io/addon-resizer:1.8.4 internal-registry.com/organization/addon-resizer:1.8.4 +docker push internal-registry.com/organization/addon-resizer:1.8.4 +docker pull 
quay.io/prometheus/alertmanager:v0.16.1 +docker tag quay.io/prometheus/alertmanager:v0.16.1 internal-registry.com/organization/alertmanager:v0.16.1 +docker push internal-registry.com/organization/alertmanager:v0.16.1 ... ``` diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 02909525..6d5525e1 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - alertmanager: 'v0.16.0', + alertmanager: 'v0.16.1', }, imageRepos+:: { diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet index 8802097e..8705389f 100644 --- a/jsonnet/kube-prometheus/alerts/general.libsonnet +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -16,14 +16,14 @@ }, }, { - alert: 'Watchdog', + alert: 'DeadMansSwitch', annotations: { message: ||| This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSnitch" integration in PagerDuty. + "DeadMansSwitch" integration in PagerDuty. 
|||, }, expr: 'vector(1)', diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 0f59af4d..5172fb94 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -18,13 +18,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { kubeStateMetrics: 'v1.5.0', kubeRbacProxy: 'v0.4.1', - addonResizer: '2.1', + addonResizer: '1.8.4', }, imageRepos+:: { kubeStateMetrics: 'quay.io/coreos/kube-state-metrics', kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy', - addonResizer: 'gcr.io/google-containers/addon-resizer-amd64', + addonResizer: 'k8s.gcr.io/addon-resizer', }, }, @@ -175,7 +175,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode, '--memory=' + $._config.kubeStateMetrics.baseMemory, '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode, - '--acceptance-offset=5', + '--threshold=5', '--deployment=kube-state-metrics', ]) + container.withEnv([ From 083f66cb4668f03c9f4dce20891e5d4f99d1126d Mon Sep 17 00:00:00 2001 From: goll Date: Wed, 13 Mar 2019 20:51:35 +0100 Subject: [PATCH 614/638] make clean generate-in-docker for addon-resizer --- jsonnetfile.lock.json | 8 +- manifests/alertmanager-alertmanager.yaml | 2 +- manifests/grafana-dashboardDefinitions.yaml | 616 +++++++++++++------ manifests/kube-state-metrics-deployment.yaml | 4 +- manifests/prometheus-rules.yaml | 46 +- 5 files changed, 486 insertions(+), 190 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5d10866c..18299a0a 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "df002d09f7b7a50321786c4f19c70d371494410b" + "version": "a4aafd81b86428bd445c5fb86c8da5c8e73cbbbc" }, { "name": 
"ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "ccb787a44f2ebdecbb346d57490fa7e49981b323" + "version": "9069b2c1be0ce32f63f9a01c4a4f8d69bc4e37d5" }, { "name": "grafonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "5d7e5391010c768a6ddd39163c35662f379e20ca" + "version": "5cc4bfab6e2453266e47d01b78cbae0b2643426e" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "a7e3bd06b2ef0286e1571836997287a81146c25a" + "version": "e1ca3b4434945e57e8e3a451cdbde74a903cc8e1" } ] } diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index 376c17ba..93c52a85 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -15,4 +15,4 @@ spec: runAsNonRoot: true runAsUser: 1000 serviceAccountName: alertmanager-main - version: v0.16.0 + version: v0.16.1 diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index d8bf4633..2086bfaa 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -60,7 +60,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:cluster_cpu_utilisation:ratio", + "expr": "node:cluster_cpu_utilisation:ratio{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -75,7 +75,7 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -146,7 +146,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_saturation_load1: / scalar(sum(min(kube_pod_info) by (node)))", + "expr": "node:node_cpu_saturation_load1:{cluster=\"$cluster\"} / scalar(sum(min(kube_pod_info{cluster=\"$cluster\"}) by (node)))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -161,7 +161,7 @@ items: "timeShift": null, "title": "CPU Saturation 
(Load1)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -244,7 +244,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:cluster_memory_utilisation:ratio", + "expr": "node:cluster_memory_utilisation:ratio{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -259,7 +259,7 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -330,7 +330,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_memory_swap_io_bytes:sum_rate", + "expr": "node:node_memory_swap_io_bytes:sum_rate{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -345,7 +345,7 @@ items: "timeShift": null, "title": "Memory Saturation (Swap I/O)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -428,7 +428,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_disk_utilisation:avg_irate / scalar(:kube_pod_info_node_count:)", + "expr": "node:node_disk_utilisation:avg_irate{cluster=\"$cluster\"} / scalar(:kube_pod_info_node_count:{cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -443,7 +443,7 @@ items: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -514,7 +514,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_disk_saturation:avg_irate / scalar(:kube_pod_info_node_count:)", + "expr": "node:node_disk_saturation:avg_irate{cluster=\"$cluster\"} / scalar(:kube_pod_info_node_count:{cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -529,7 +529,7 @@ items: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, 
"value_type": "individual" }, @@ -612,7 +612,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_net_utilisation:sum_irate", + "expr": "node:node_net_utilisation:sum_irate{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -627,7 +627,7 @@ items: "timeShift": null, "title": "Net Utilisation (Transmitted)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -698,7 +698,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_net_saturation:sum_irate", + "expr": "node:node_net_saturation:sum_irate{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -713,7 +713,7 @@ items: "timeShift": null, "title": "Net Saturation (Dropped)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{cluster=\"$cluster\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -811,7 +811,7 @@ items: "timeShift": null, "title": "Disk Capacity", 
"tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -875,6 +875,33 @@ items: "refresh": 1, "regex": "", "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_node_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -976,7 +1003,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_utilisation:avg1m{node=\"$node\"}", + "expr": "node:node_cpu_utilisation:avg1m{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -991,7 +1018,7 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1062,7 +1089,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_saturation_load1:{node=\"$node\"}", + "expr": "node:node_cpu_saturation_load1:{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Saturation", @@ -1077,7 +1104,7 @@ items: "timeShift": null, "title": "CPU Saturation (Load1)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1160,7 +1187,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_memory_utilisation:{node=\"$node\"}", + "expr": "node:node_memory_utilisation:{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Memory", @@ -1175,7 +1202,7 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1246,7 +1273,7 @@ items: 
"steppedLine": false, "targets": [ { - "expr": "node:node_memory_swap_io_bytes:sum_rate{node=\"$node\"}", + "expr": "node:node_memory_swap_io_bytes:sum_rate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Swap IO", @@ -1261,7 +1288,7 @@ items: "timeShift": null, "title": "Memory Saturation (Swap I/O)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1344,7 +1371,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_disk_utilisation:avg_irate{node=\"$node\"}", + "expr": "node:node_disk_utilisation:avg_irate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -1359,7 +1386,7 @@ items: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1430,7 +1457,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_disk_saturation:avg_irate{node=\"$node\"}", + "expr": "node:node_disk_saturation:avg_irate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Saturation", @@ -1445,7 +1472,7 @@ items: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1528,7 +1555,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_net_utilisation:sum_irate{node=\"$node\"}", + "expr": "node:node_net_utilisation:sum_irate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -1543,7 +1570,7 @@ items: "timeShift": null, "title": "Net Utilisation (Transmitted)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1614,7 +1641,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_net_saturation:sum_irate{node=\"$node\"}", + "expr": 
"node:node_net_saturation:sum_irate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Saturation", @@ -1629,7 +1656,7 @@ items: "timeShift": null, "title": "Net Saturation (Dropped)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1712,7 +1739,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_filesystem_usage:\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{node=\"$node\"}\n", + "expr": "node:node_filesystem_usage:{cluster=\"$cluster\"}\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{cluster=\"$cluster\", node=\"$node\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -1727,7 +1754,7 @@ items: "timeShift": null, "title": "Disk Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1792,6 +1819,33 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_node_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -1807,7 +1861,7 @@ items: "options": [ ], - "query": "label_values(kube_node_info, node)", + "query": "label_values(kube_node_info{cluster=\"$cluster\"}, node)", "refresh": 1, "regex": "", "sort": 2, @@ -1920,7 +1974,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[1m]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -1932,7 +1986,7 @@ 
items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2004,7 +2058,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores) / sum(node:node_num_cpu:sum)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) / sum(node:node_num_cpu:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2016,7 +2070,7 @@ items: "timeShift": null, "title": "CPU Requests Commitment", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2088,7 +2142,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores) / sum(node:node_num_cpu:sum)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) / sum(node:node_num_cpu:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2100,7 +2154,7 @@ items: "timeShift": null, "title": "CPU Limits Commitment", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2172,7 +2226,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum) / sum(:node_memory_MemTotal_bytes:sum)", + "expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2184,7 +2238,7 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2256,7 +2310,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", + "expr": 
"sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2268,7 +2322,7 @@ items: "timeShift": null, "title": "Memory Requests Commitment", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2340,7 +2394,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2352,7 +2406,7 @@ items: "timeShift": null, "title": "Memory Limits Commitment", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2435,7 +2489,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2450,7 +2504,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2638,7 +2692,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -2664,7 +2718,7 @@ items: ], "targets": [ { - "expr": 
"sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2673,7 +2727,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2682,7 +2736,7 @@ items: "step": 10 }, { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2691,7 +2745,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2700,7 +2754,7 @@ items: "step": 10 }, { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2716,7 +2770,7 @@ items: "timeShift": null, "title": "CPU Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, 
"value_type": "individual" }, @@ -2800,7 +2854,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2815,7 +2869,7 @@ items: "timeShift": null, "title": "Memory Usage (w/o cache)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3003,7 +3057,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -3029,7 +3083,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3038,7 +3092,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3047,7 +3101,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3056,7 +3110,7 @@ items: "step": 10 }, { - "expr": 
"sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3065,7 +3119,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3081,7 +3135,7 @@ items: "timeShift": null, "title": "Requests by Namespace", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3146,6 +3200,33 @@ items: "refresh": 1, "regex": "", "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(node_cpu_seconds_total, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -3247,7 +3328,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}) by (pod_name)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod_name}}", @@ -3262,7 +3343,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3450,7 +3531,7 @@ items: 
"decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -3476,7 +3557,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3485,7 +3566,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3494,7 +3575,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3503,7 +3584,7 @@ items: "step": 10 }, { - "expr": 
"sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3512,7 +3593,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3528,7 +3609,7 @@ items: "timeShift": null, "title": "CPU Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3612,7 +3693,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", container_name!=\"\"}) by (pod_name)", + "expr": "sum(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"}) by (pod_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod_name}}", @@ -3627,7 +3708,7 @@ items: "timeShift": null, "title": "Memory Usage (w/o cache)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3869,7 +3950,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": 
"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -3895,7 +3976,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3904,7 +3985,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3913,7 +3994,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3922,7 +4003,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3931,7 +4012,7 @@ items: "step": 10 }, { - "expr": 
"sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3940,7 +4021,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_rss{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3949,7 +4030,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_cache{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3958,7 +4039,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_swap{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3974,7 +4055,7 @@ items: "timeShift": null, "title": "Memory Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": 
"individual" }, @@ -4040,6 +4121,33 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -4055,7 +4163,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info, namespace)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 1, "regex": "", "sort": 2, @@ -4167,7 +4275,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", cluster=\"$cluster\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4182,7 +4290,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4396,7 +4504,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", 
\"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4405,7 +4513,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4414,7 +4522,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4423,7 +4531,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4432,7 +4540,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + 
"expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4448,7 +4556,7 @@ items: "timeShift": null, "title": "CPU Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4532,7 +4640,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}} (RSS)", @@ -4540,7 +4648,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_cache{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}} (Cache)", @@ -4548,7 +4656,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_swap{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}} (Swap)", @@ 
-4563,7 +4671,7 @@ items: "timeShift": null, "title": "Memory Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4831,7 +4939,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4840,7 +4948,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4849,7 +4957,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4858,7 +4966,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", + "expr": 
"sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4867,7 +4975,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4876,7 +4984,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_rss{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4885,7 +4993,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_cache{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by 
(container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4894,7 +5002,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_swap{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4910,7 +5018,7 @@ items: "timeShift": null, "title": "Memory Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4976,6 +5084,33 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -4991,7 +5126,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info, namespace)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 1, "regex": "", "sort": 2, @@ -5018,7 +5153,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", "refresh": 1, "regex": "", "sort": 2, @@ -5143,21 +5278,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(node_load1{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_load1{cluster=\"$cluster\", 
job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 1m", "refId": "A" }, { - "expr": "max(node_load5{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_load5{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 5m", "refId": "B" }, { - "expr": "max(node_load15{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_load15{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 15m", @@ -5171,7 +5306,7 @@ items: "timeShift": null, "title": "System load", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5248,7 +5383,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m]))", + "expr": "sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cpu}}", @@ -5262,7 +5397,7 @@ items: "timeShift": null, "title": "Usage Per Core", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5352,7 +5487,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", + "expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", "format": "time_series", "intervalFactor": 10, "legendFormat": "{{ cpu }}", @@ -5366,7 +5501,7 @@ items: "timeShift": null, "title": "CPU Utilizaion", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": 
"individual" }, @@ -5460,7 +5595,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", + "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -5469,6 +5604,9 @@ items: ], "thresholds": "80, 90", "title": "CPU Usage", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -5537,28 +5675,28 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "expr": "max(\n node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory used", "refId": "A" }, { - "expr": "max(node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory buffers", "refId": "B" }, { - "expr": "max(node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_Cached_bytes{cluster=\"$cluster\", 
job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory cached", "refId": "C" }, { - "expr": "max(node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory free", @@ -5572,7 +5710,7 @@ items: "timeShift": null, "title": "Memory Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5666,7 +5804,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", + "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -5675,6 +5813,9 @@ items: ], "thresholds": "80, 90", "title": "Memory Usage", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -5750,21 +5891,21 @@ items: "steppedLine": false, "targets": [ { - "expr": 
"max(rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_read_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "read", "refId": "A" }, { - "expr": "max(rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_written_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "written", "refId": "B" }, { - "expr": "max(rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_io_time_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "io time", @@ -5778,7 +5919,7 @@ items: "timeShift": null, "title": "Disk I/O", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5855,7 +5996,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_filesystem_usage:\n", + "expr": "node:node_filesystem_usage:{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5869,7 +6010,7 @@ items: "timeShift": null, "title": "Disk Space Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5959,7 +6100,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5973,7 +6114,7 @@ items: "timeShift": null, "title": "Network 
Received", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6050,7 +6191,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6064,7 +6205,7 @@ items: "timeShift": null, "title": "Network Transmitted", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6154,14 +6295,14 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(\n node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "expr": "max(\n node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "inodes used", "refId": "A" }, { - "expr": "max(node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "inodes free", @@ -6175,7 +6316,7 @@ items: "timeShift": null, "title": "Inodes Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6269,7 +6410,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(\n (\n (\n node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_filesystem_files{job=\"node-exporter\", 
instance=\"$instance\"}\n ) * 100)\n", + "expr": "max(\n (\n (\n node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -6278,6 +6419,9 @@ items: ], "thresholds": "80, 90", "title": "Inodes Usage", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -6326,6 +6470,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -6336,7 +6506,7 @@ items: "options": [ ], - "query": "label_values(node_boot_time_seconds{job=\"node-exporter\"}, instance)", + "query": "label_values(node_boot_time_seconds{cluster=\"$cluster\", job=\"node-exporter\"}, instance)", "refresh": 2, "regex": "", "sort": 0, @@ -6461,7 +6631,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} - kubelet_volume_stats_available_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"}) / kubelet_volume_stats_capacity_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", + "expr": "(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"} - kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"}) / 
kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ Usage }}", @@ -6475,7 +6645,7 @@ items: "timeShift": null, "title": "Volume Space Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6565,7 +6735,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "kubelet_volume_stats_inodes_used{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} / kubelet_volume_stats_inodes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", + "expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"} / kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ Usage }}", @@ -6579,7 +6749,7 @@ items: "timeShift": null, "title": "Volume inodes Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6649,6 +6819,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kubelet_volume_stats_capacity_bytes, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -6659,7 +6855,7 @@ items: "options": [ ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}, exported_namespace)", + "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\"}, exported_namespace)", "refresh": 2, "regex": "", "sort": 0, @@ -6685,7 +6881,7 @@ items: "options": [ ], - "query": 
"label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", exported_namespace=\"$namespace\"}, persistentvolumeclaim)", + "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", exported_namespace=\"$namespace\"}, persistentvolumeclaim)", "refresh": 2, "regex": "", "sort": 0, @@ -6805,25 +7001,26 @@ items: ], "spaceLength": 10, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -6837,7 +7034,7 @@ items: "timeShift": null, "title": "Memory 
Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6922,11 +7119,12 @@ items: ], "spaceLength": 10, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ container_name }}", @@ -6940,7 +7138,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -7025,11 +7223,12 @@ items: ], "spaceLength": 10, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))", + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", @@ -7043,7 +7242,7 @@ items: "timeShift": null, "title": "Network I/O", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -7113,6 +7312,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + 
}, "datasource": "$datasource", "hide": 0, @@ -7123,7 +7348,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info, namespace)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 2, "regex": "", "sort": 0, @@ -7149,7 +7374,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\u007e\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -7175,7 +7400,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "query": "label_values(kube_pod_container_info{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}, container)", "refresh": 2, "regex": "", "sort": 0, @@ -7316,7 +7541,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7325,6 +7550,9 @@ items: ], "thresholds": "", "title": "CPU", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7396,7 +7624,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7405,6 +7633,9 @@ items: ], "thresholds": "", "title": "Memory", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", 
"valueMaps": [ @@ -7476,7 +7707,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7485,6 +7716,9 @@ items: ], "thresholds": "", "title": "Network", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7571,7 +7805,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7580,6 +7814,9 @@ items: ], "thresholds": "", "title": "Desired Replicas", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7652,7 +7889,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ 
-7661,6 +7898,9 @@ items: ], "thresholds": "", "title": "Replicas of current version", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7733,7 +7973,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7742,6 +7982,9 @@ items: ], "thresholds": "", "title": "Observed Generation", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7814,7 +8057,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", namespace=\"$namespace\"}) without (instance, pod)", + "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7823,6 +8066,9 @@ items: ], "thresholds": "", "title": "Metadata Generation", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7890,35 +8136,35 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas specified", 
"refId": "A" }, { - "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas created", "refId": "B" }, { - "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "ready", "refId": "C" }, { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas of current version", "refId": "D" }, { - "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "updated", @@ -7932,7 +8178,7 @@ items: "timeShift": null, "title": "Replicas", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -8002,6 +8248,32 @@ items: "allValue": null, 
"current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index d6d15672..d57ea12d 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -71,7 +71,7 @@ spec: - --extra-cpu=2m - --memory=150Mi - --extra-memory=30Mi - - --acceptance-offset=5 + - --threshold=5 - --deployment=kube-state-metrics env: - name: MY_POD_NAME @@ -84,7 +84,7 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - image: gcr.io/google-containers/addon-resizer-amd64:2.1 + image: k8s.gcr.io/addon-resizer:1.8.4 name: addon-resizer resources: limits: diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 19432b5f..874520f6 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -225,21 +225,21 @@ spec: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | - avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])) + avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) record: :node_disk_utilisation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) + irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: 
node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -769,9 +769,9 @@ spec: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10 + sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3 for: 10m labels: severity: critical @@ -780,9 +780,33 @@ spec: message: API server is returning errors for {{ $value }}% of requests. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 + sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1 + for: 10m + labels: + severity: warning + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests for + {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests for + {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5 for: 10m labels: severity: warning @@ -843,14 +867,14 @@ spec: for: 10m labels: severity: warning - - alert: Watchdog + - alert: DeadMansSwitch annotations: message: | This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. 
There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSnitch" integration in PagerDuty. + "DeadMansSwitch" integration in PagerDuty. expr: vector(1) labels: severity: none @@ -951,7 +975,7 @@ spec: log (WAL).' summary: Prometheus write-ahead log is corrupted expr: | - tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0 + prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0 for: 4h labels: severity: warning From ef539c398b023245e00f83af7ec3b45898b25c1a Mon Sep 17 00:00:00 2001 From: goll Date: Fri, 15 Mar 2019 15:37:42 +0100 Subject: [PATCH 615/638] Rename DeadMansSwitch to Watchdog --- README.md | 4 ++-- docs/developing-prometheus-rules-and-grafana-dashboards.md | 4 ++-- examples/alertmanager-config.jsonnet | 2 +- examples/alertmanager-config.yaml | 2 +- examples/existingrule.json | 2 +- examples/existingrule.yaml | 4 ++-- examples/prometheus-additional-alert-rule-example.jsonnet | 4 ++-- jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet | 2 +- jsonnet/kube-prometheus/alerts/general.libsonnet | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 79e7f929..bc320b79 100644 --- a/README.md +++ b/README.md @@ -298,7 +298,7 @@ These are the available fields with their respective default values: receiver: 'null' routes: - match: - alertname: DeadMansSwitch + alertname: Watchdog receiver: 'null' receivers: - name: 'null' @@ -497,7 +497,7 @@ The Alertmanager configuration is located in the `_config.alertmanager.config` c receiver: 'null' routes: - match: - alertname: DeadMansSwitch + alertname: Watchdog receiver: 'null' receivers: - name: 'null' diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index dcd0ad72..d6c74999 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ 
b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -49,13 +49,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { name: 'example-group', rules: [ { - alert: 'DeadMansSwitch', + alert: 'Watchdog', expr: 'vector(1)', labels: { severity: 'none', }, annotations: { - description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', + description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.', }, }, ], diff --git a/examples/alertmanager-config.jsonnet b/examples/alertmanager-config.jsonnet index 162104d7..f08dbe19 100644 --- a/examples/alertmanager-config.jsonnet +++ b/examples/alertmanager-config.jsonnet @@ -12,7 +12,7 @@ receiver: 'null' routes: - match: - alertname: DeadMansSwitch + alertname: Watchdog receiver: 'null' receivers: - name: 'null' diff --git a/examples/alertmanager-config.yaml b/examples/alertmanager-config.yaml index 78c65b64..b341b55f 100644 --- a/examples/alertmanager-config.yaml +++ b/examples/alertmanager-config.yaml @@ -9,7 +9,7 @@ route: receiver: 'null' routes: - match: - alertname: DeadMansSwitch + alertname: Watchdog receiver: 'null' receivers: - name: 'null' diff --git a/examples/existingrule.json b/examples/existingrule.json index b29a5c45..41d6620b 100644 --- a/examples/existingrule.json +++ b/examples/existingrule.json @@ -1 +1 @@ -{"groups":[{"name":"example-group","rules":[{"alert":"DeadMansSwitch","annotations":{"description":"This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]} \ No newline at end of file +{"groups":[{"name":"example-group","rules":[{"alert":"Watchdog","annotations":{"description":"This is a Watchdog meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]} \ No newline at end of file diff --git a/examples/existingrule.yaml b/examples/existingrule.yaml 
index 94d9d691..6a67032f 100644 --- a/examples/existingrule.yaml +++ b/examples/existingrule.yaml @@ -1,9 +1,9 @@ groups: - name: example-group rules: - - alert: DeadMansSwitch + - alert: Watchdog expr: vector(1) labels: severity: "none" annotations: - description: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional. + description: This is a Watchdog meant to ensure that the entire alerting pipeline is functional. diff --git a/examples/prometheus-additional-alert-rule-example.jsonnet b/examples/prometheus-additional-alert-rule-example.jsonnet index b8d16af8..622df032 100644 --- a/examples/prometheus-additional-alert-rule-example.jsonnet +++ b/examples/prometheus-additional-alert-rule-example.jsonnet @@ -8,13 +8,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { name: 'example-group', rules: [ { - alert: 'DeadMansSwitch', + alert: 'Watchdog', expr: 'vector(1)', labels: { severity: 'none', }, annotations: { - description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', + description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.', }, }, ], diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 6d5525e1..47e61e29 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -28,7 +28,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; { receiver: 'null', match: { - alertname: 'DeadMansSwitch', + alertname: 'Watchdog', }, }, ], diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet index 8705389f..24a4dfd5 100644 --- a/jsonnet/kube-prometheus/alerts/general.libsonnet +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -16,7 +16,7 @@ }, }, { - alert: 'DeadMansSwitch', + alert: 'Watchdog', 
annotations: { message: ||| This is an alert meant to ensure that the entire alerting pipeline is functional. From bde9e7fd4ae8f5ccb61bba0cb6255fed98aedf94 Mon Sep 17 00:00:00 2001 From: goll Date: Fri, 15 Mar 2019 16:01:57 +0100 Subject: [PATCH 616/638] generate-in-docker Rename DeadMansSwitch to Watchdog --- jsonnetfile.lock.json | 2 +- manifests/alertmanager-secret.yaml | 2 +- manifests/prometheus-rules.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 18299a0a..6d150ba4 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "a4aafd81b86428bd445c5fb86c8da5c8e73cbbbc" + "version": "1ee4e35b5807c954ccdd5cbac10f6b67c6dec685" }, { "name": "ksonnet", diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml index 79fc7a21..5ee9c606 100644 --- a/manifests/alertmanager-secret.yaml +++ b/manifests/alertmanager-secret.yaml @@ -1,6 +1,6 @@ apiVersion: v1 data: - alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJEZWFkTWFuc1N3aXRjaCIKICAgICJyZWNlaXZlciI6ICJudWxsIg== + alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg== kind: Secret metadata: name: alertmanager-main diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 
874520f6..86eb4bdc 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -867,7 +867,7 @@ spec: for: 10m labels: severity: warning - - alert: DeadMansSwitch + - alert: Watchdog annotations: message: | This is an alert meant to ensure that the entire alerting pipeline is functional. From 812d0c81259d5f548340b2ef4ac25d6f9793db3a Mon Sep 17 00:00:00 2001 From: goll Date: Fri, 15 Mar 2019 17:11:29 +0100 Subject: [PATCH 617/638] Fix Watchdog alert typo --- jsonnet/kube-prometheus/alerts/general.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet index 24a4dfd5..8802097e 100644 --- a/jsonnet/kube-prometheus/alerts/general.libsonnet +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -23,7 +23,7 @@ This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSwitch" integration in PagerDuty. + "DeadMansSnitch" integration in PagerDuty. 
|||, }, expr: 'vector(1)', From 369bb62349e2e57c1423d830a993ebf1d0f49e38 Mon Sep 17 00:00:00 2001 From: goll Date: Fri, 15 Mar 2019 17:19:57 +0100 Subject: [PATCH 618/638] generate-in-docker Fix Watchdog alert typo --- jsonnetfile.lock.json | 4 ++-- manifests/grafana-dashboardDefinitions.yaml | 2 +- manifests/prometheus-rules.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 6d150ba4..ad38b5c9 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "1ee4e35b5807c954ccdd5cbac10f6b67c6dec685" + "version": "9faab58c2b1cce4def2cc35045162554b8e4a706" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "9069b2c1be0ce32f63f9a01c4a4f8d69bc4e37d5" + "version": "b8b1a40066bd40bf7612bbb1cc9208f76530f44a" }, { "name": "grafonnet", diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 2086bfaa..cdb8ff3f 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -5499,7 +5499,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU Utilizaion", + "title": "CPU Utilization", "tooltip": { "shared": false, "sort": 0, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 86eb4bdc..7f04e057 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -874,7 +874,7 @@ spec: This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSwitch" integration in PagerDuty. + "DeadMansSnitch" integration in PagerDuty. 
expr: vector(1) labels: severity: none From 5fcf652cbfd9d94c8a8ff1dc19b77d0ef7f17f64 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Mon, 18 Mar 2019 14:53:56 -0700 Subject: [PATCH 619/638] Adding kustomization files for remote bases Closes #2391 Adds support to allow kustomize users to point to a [remote base](https://github.com/kubernetes-sigs/kustomize/blob/master/examples/remoteBuild.md) of the prometheus-operator and kube-prometheus. Now that kustomize is a part of kubectl, this provides a very simple way of installing prometheus-operator and kube-prometheus with no additional tools besides kubectl. * [Added] kustomization.yaml: hardcoded kustomization base for prometheus-operator bundle.yaml * [Added] kustomization generation from kube-prometheus example jsonnet --- example.jsonnet | 33 +++++++++++++----- manifests/kustomization.yaml | 66 ++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 8 deletions(-) create mode 100644 manifests/kustomization.yaml diff --git a/example.jsonnet b/example.jsonnet index fcd2bb01..17a88429 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -5,11 +5,28 @@ local kp = }, }; -{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + -{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + -{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + -{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
+local kustomization = { + apiVersion: 'kustomize.config.k8s.io/v1beta1', + kind: 'Kustomization', + resources: + ['00namespace-' + name + '.yaml' for name in std.objectFields(kp.kubePrometheus)] + + ['0prometheus-operator-' + name + '.yaml' for name in std.objectFields(kp.prometheusOperator)] + + ['node-exporter-' + name + '.yaml' for name in std.objectFields(kp.nodeExporter)] + + ['kube-state-metrics-' + name + '.yaml' for name in std.objectFields(kp.kubeStateMetrics)] + + ['alertmanager-' + name + '.yaml' for name in std.objectFields(kp.alertmanager)] + + ['prometheus-' + name + '.yaml' for name in std.objectFields(kp.prometheus)] + + ['prometheus-adapter-' + name + '.yaml' for name in std.objectFields(kp.prometheusAdapter)] + + ['grafana-' + name + '.yaml' for name in std.objectFields(kp.grafana)], +}; + +local manifests = + { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; + +manifests { kustomization: kustomization } diff --git a/manifests/kustomization.yaml b/manifests/kustomization.yaml new file mode 100644 index 00000000..6b4f0518 --- /dev/null +++ b/manifests/kustomization.yaml @@ -0,0 +1,66 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- 
00namespace-namespace.yaml +- 0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +- 0prometheus-operator-0prometheusCustomResourceDefinition.yaml +- 0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +- 0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +- 0prometheus-operator-clusterRole.yaml +- 0prometheus-operator-clusterRoleBinding.yaml +- 0prometheus-operator-deployment.yaml +- 0prometheus-operator-service.yaml +- 0prometheus-operator-serviceAccount.yaml +- 0prometheus-operator-serviceMonitor.yaml +- node-exporter-clusterRole.yaml +- node-exporter-clusterRoleBinding.yaml +- node-exporter-daemonset.yaml +- node-exporter-service.yaml +- node-exporter-serviceAccount.yaml +- node-exporter-serviceMonitor.yaml +- kube-state-metrics-clusterRole.yaml +- kube-state-metrics-clusterRoleBinding.yaml +- kube-state-metrics-deployment.yaml +- kube-state-metrics-role.yaml +- kube-state-metrics-roleBinding.yaml +- kube-state-metrics-service.yaml +- kube-state-metrics-serviceAccount.yaml +- kube-state-metrics-serviceMonitor.yaml +- alertmanager-alertmanager.yaml +- alertmanager-secret.yaml +- alertmanager-service.yaml +- alertmanager-serviceAccount.yaml +- alertmanager-serviceMonitor.yaml +- prometheus-clusterRole.yaml +- prometheus-clusterRoleBinding.yaml +- prometheus-prometheus.yaml +- prometheus-roleBindingConfig.yaml +- prometheus-roleBindingSpecificNamespaces.yaml +- prometheus-roleConfig.yaml +- prometheus-roleSpecificNamespaces.yaml +- prometheus-rules.yaml +- prometheus-service.yaml +- prometheus-serviceAccount.yaml +- prometheus-serviceMonitor.yaml +- prometheus-serviceMonitorApiserver.yaml +- prometheus-serviceMonitorCoreDNS.yaml +- prometheus-serviceMonitorKubeControllerManager.yaml +- prometheus-serviceMonitorKubeScheduler.yaml +- prometheus-serviceMonitorKubelet.yaml +- prometheus-adapter-apiService.yaml +- prometheus-adapter-clusterRole.yaml +- prometheus-adapter-clusterRoleBinding.yaml +- 
prometheus-adapter-clusterRoleBindingDelegator.yaml +- prometheus-adapter-clusterRoleServerResources.yaml +- prometheus-adapter-configMap.yaml +- prometheus-adapter-deployment.yaml +- prometheus-adapter-roleBindingAuthReader.yaml +- prometheus-adapter-service.yaml +- prometheus-adapter-serviceAccount.yaml +- grafana-dashboardDatasources.yaml +- grafana-dashboardDefinitions.yaml +- grafana-dashboardSources.yaml +- grafana-deployment.yaml +- grafana-service.yaml +- grafana-serviceAccount.yaml +- grafana-serviceMonitor.yaml From 7e0e5cd188da8b636865cff5308f09709d41e2e2 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Tue, 19 Mar 2019 16:21:00 -0700 Subject: [PATCH 620/638] DRY up resource names --- README.md | 43 +++++++++++++++++++++------ example.jsonnet | 25 +++++++--------- manifests/kustomization.yaml | 56 ++++++++++++++++++------------------ 3 files changed, 73 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index bc320b79..64cc805a 100644 --- a/README.md +++ b/README.md @@ -157,14 +157,41 @@ local kp = }, }; -{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + -{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + -{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + -{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +local manifests = + { ['00namespace-' + name]: kp.kubePrometheus[name] for name in 
std.objectFields(kp.kubePrometheus) } + + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; + +// local kustomization = { +// apiVersion: 'kustomize.config.k8s.io/v1beta1', +// kind: 'Kustomization', +// resources: +// ['00namespace-' + name + '.yaml' for name in std.objectFields(kp.kubePrometheus)] + +// ['0prometheus-operator-' + name + '.yaml' for name in std.objectFields(kp.prometheusOperator)] + +// ['node-exporter-' + name + '.yaml' for name in std.objectFields(kp.nodeExporter)] + +// ['kube-state-metrics-' + name + '.yaml' for name in std.objectFields(kp.kubeStateMetrics)] + +// ['alertmanager-' + name + '.yaml' for name in std.objectFields(kp.alertmanager)] + +// ['prometheus-' + name + '.yaml' for name in std.objectFields(kp.prometheus)] + +// ['prometheus-adapter-' + name + '.yaml' for name in std.objectFields(kp.prometheusAdapter)] + +// ['grafana-' + name + '.yaml' for name in std.objectFields(kp.grafana)], +// }; + +local foo = function(name) { name ::+ '.yaml' }; + +local kustomization = { + apiVersion: 'kustomize.config.k8s.io/v1beta1', + kind: 'Kustomization', + resources: std.map(foo , std.objectFields(manifests)) +}; + +manifests { + kustomization: kustomization +} ``` And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json 
structure of `{filename: manifest-content}`): diff --git a/example.jsonnet b/example.jsonnet index 17a88429..b5259edd 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -5,20 +5,6 @@ local kp = }, }; -local kustomization = { - apiVersion: 'kustomize.config.k8s.io/v1beta1', - kind: 'Kustomization', - resources: - ['00namespace-' + name + '.yaml' for name in std.objectFields(kp.kubePrometheus)] + - ['0prometheus-operator-' + name + '.yaml' for name in std.objectFields(kp.prometheusOperator)] + - ['node-exporter-' + name + '.yaml' for name in std.objectFields(kp.nodeExporter)] + - ['kube-state-metrics-' + name + '.yaml' for name in std.objectFields(kp.kubeStateMetrics)] + - ['alertmanager-' + name + '.yaml' for name in std.objectFields(kp.alertmanager)] + - ['prometheus-' + name + '.yaml' for name in std.objectFields(kp.prometheus)] + - ['prometheus-adapter-' + name + '.yaml' for name in std.objectFields(kp.prometheusAdapter)] + - ['grafana-' + name + '.yaml' for name in std.objectFields(kp.grafana)], -}; - local manifests = { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + @@ -29,4 +15,13 @@ local manifests = { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; -manifests { kustomization: kustomization } +local kustomizationResourceFile(name) = name + ".yaml"; +local kustomization = { + apiVersion: 'kustomize.config.k8s.io/v1beta1', + kind: 'Kustomization', + resources: std.map(kustomizationResourceFile, std.objectFields(manifests)) +}; + +manifests { + kustomization: kustomization, +} diff --git a/manifests/kustomization.yaml b/manifests/kustomization.yaml index 6b4f0518..a74f797f 100644 --- a/manifests/kustomization.yaml +++ 
b/manifests/kustomization.yaml @@ -12,12 +12,18 @@ resources: - 0prometheus-operator-service.yaml - 0prometheus-operator-serviceAccount.yaml - 0prometheus-operator-serviceMonitor.yaml -- node-exporter-clusterRole.yaml -- node-exporter-clusterRoleBinding.yaml -- node-exporter-daemonset.yaml -- node-exporter-service.yaml -- node-exporter-serviceAccount.yaml -- node-exporter-serviceMonitor.yaml +- alertmanager-alertmanager.yaml +- alertmanager-secret.yaml +- alertmanager-service.yaml +- alertmanager-serviceAccount.yaml +- alertmanager-serviceMonitor.yaml +- grafana-dashboardDatasources.yaml +- grafana-dashboardDefinitions.yaml +- grafana-dashboardSources.yaml +- grafana-deployment.yaml +- grafana-service.yaml +- grafana-serviceAccount.yaml +- grafana-serviceMonitor.yaml - kube-state-metrics-clusterRole.yaml - kube-state-metrics-clusterRoleBinding.yaml - kube-state-metrics-deployment.yaml @@ -26,11 +32,22 @@ resources: - kube-state-metrics-service.yaml - kube-state-metrics-serviceAccount.yaml - kube-state-metrics-serviceMonitor.yaml -- alertmanager-alertmanager.yaml -- alertmanager-secret.yaml -- alertmanager-service.yaml -- alertmanager-serviceAccount.yaml -- alertmanager-serviceMonitor.yaml +- node-exporter-clusterRole.yaml +- node-exporter-clusterRoleBinding.yaml +- node-exporter-daemonset.yaml +- node-exporter-service.yaml +- node-exporter-serviceAccount.yaml +- node-exporter-serviceMonitor.yaml +- prometheus-adapter-apiService.yaml +- prometheus-adapter-clusterRole.yaml +- prometheus-adapter-clusterRoleBinding.yaml +- prometheus-adapter-clusterRoleBindingDelegator.yaml +- prometheus-adapter-clusterRoleServerResources.yaml +- prometheus-adapter-configMap.yaml +- prometheus-adapter-deployment.yaml +- prometheus-adapter-roleBindingAuthReader.yaml +- prometheus-adapter-service.yaml +- prometheus-adapter-serviceAccount.yaml - prometheus-clusterRole.yaml - prometheus-clusterRoleBinding.yaml - prometheus-prometheus.yaml @@ -47,20 +64,3 @@ resources: - 
prometheus-serviceMonitorKubeControllerManager.yaml - prometheus-serviceMonitorKubeScheduler.yaml - prometheus-serviceMonitorKubelet.yaml -- prometheus-adapter-apiService.yaml -- prometheus-adapter-clusterRole.yaml -- prometheus-adapter-clusterRoleBinding.yaml -- prometheus-adapter-clusterRoleBindingDelegator.yaml -- prometheus-adapter-clusterRoleServerResources.yaml -- prometheus-adapter-configMap.yaml -- prometheus-adapter-deployment.yaml -- prometheus-adapter-roleBindingAuthReader.yaml -- prometheus-adapter-service.yaml -- prometheus-adapter-serviceAccount.yaml -- grafana-dashboardDatasources.yaml -- grafana-dashboardDefinitions.yaml -- grafana-dashboardSources.yaml -- grafana-deployment.yaml -- grafana-service.yaml -- grafana-serviceAccount.yaml -- grafana-serviceMonitor.yaml From 1664600b9177b78948d25cd4f2b89f6e5fa9b33c Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Wed, 20 Mar 2019 09:55:27 -0700 Subject: [PATCH 621/638] Moved kustomize to own jsonnet file * [Modified] Makefile to run kustomize.jsonnet * [Modified] Moved kustomization from example.jsonnet to examples/kustomize.jsonnet * [Modified] kustomization file to land at root of kube-prometheus so current users can continue to use `kubectl apply -f manifests` --- Makefile | 2 +- example.jsonnet | 28 +++++---------- examples/kustomize.jsonnet | 27 +++++++++++++++ kustomization.yaml | 66 ++++++++++++++++++++++++++++++++++++ manifests/kustomization.yaml | 66 ------------------------------------ 5 files changed, 102 insertions(+), 87 deletions(-) create mode 100644 examples/kustomize.jsonnet create mode 100644 kustomization.yaml delete mode 100644 manifests/kustomization.yaml diff --git a/Makefile b/Makefile index 6b3651ae..88bf2547 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ generate: manifests **.md manifests: vendor example.jsonnet build.sh rm -rf manifests - ./build.sh + ./build.sh ./examples/kustomize.jsonnet vendor: $(JB_BINARY) jsonnetfile.json jsonnetfile.lock.json rm -rf vendor diff 
--git a/example.jsonnet b/example.jsonnet index b5259edd..81c473c6 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -5,23 +5,11 @@ local kp = }, }; -local manifests = - { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + - { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + - { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + - { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + - { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + - { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + - { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + - { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; - -local kustomizationResourceFile(name) = name + ".yaml"; -local kustomization = { - apiVersion: 'kustomize.config.k8s.io/v1beta1', - kind: 'Kustomization', - resources: std.map(kustomizationResourceFile, std.objectFields(manifests)) -}; - -manifests { - kustomization: kustomization, -} +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: 
kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } \ No newline at end of file diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet new file mode 100644 index 00000000..d953f81e --- /dev/null +++ b/examples/kustomize.jsonnet @@ -0,0 +1,27 @@ +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + }; + +local manifests = + { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; + +local kustomizationResourceFile(name) = "./manifests/" + name + ".yaml"; +local kustomization = { + apiVersion: 'kustomize.config.k8s.io/v1beta1', + kind: 'Kustomization', + resources: std.map(kustomizationResourceFile, std.objectFields(manifests)) +}; + +manifests { + "../kustomization": kustomization, +} \ No newline at end of file diff --git a/kustomization.yaml b/kustomization.yaml new file mode 100644 index 00000000..fc91b965 --- /dev/null +++ b/kustomization.yaml @@ -0,0 +1,66 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- ./manifests/00namespace-namespace.yaml +- 
./manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +- ./manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +- ./manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +- ./manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +- ./manifests/0prometheus-operator-clusterRole.yaml +- ./manifests/0prometheus-operator-clusterRoleBinding.yaml +- ./manifests/0prometheus-operator-deployment.yaml +- ./manifests/0prometheus-operator-service.yaml +- ./manifests/0prometheus-operator-serviceAccount.yaml +- ./manifests/0prometheus-operator-serviceMonitor.yaml +- ./manifests/alertmanager-alertmanager.yaml +- ./manifests/alertmanager-secret.yaml +- ./manifests/alertmanager-service.yaml +- ./manifests/alertmanager-serviceAccount.yaml +- ./manifests/alertmanager-serviceMonitor.yaml +- ./manifests/grafana-dashboardDatasources.yaml +- ./manifests/grafana-dashboardDefinitions.yaml +- ./manifests/grafana-dashboardSources.yaml +- ./manifests/grafana-deployment.yaml +- ./manifests/grafana-service.yaml +- ./manifests/grafana-serviceAccount.yaml +- ./manifests/grafana-serviceMonitor.yaml +- ./manifests/kube-state-metrics-clusterRole.yaml +- ./manifests/kube-state-metrics-clusterRoleBinding.yaml +- ./manifests/kube-state-metrics-deployment.yaml +- ./manifests/kube-state-metrics-role.yaml +- ./manifests/kube-state-metrics-roleBinding.yaml +- ./manifests/kube-state-metrics-service.yaml +- ./manifests/kube-state-metrics-serviceAccount.yaml +- ./manifests/kube-state-metrics-serviceMonitor.yaml +- ./manifests/node-exporter-clusterRole.yaml +- ./manifests/node-exporter-clusterRoleBinding.yaml +- ./manifests/node-exporter-daemonset.yaml +- ./manifests/node-exporter-service.yaml +- ./manifests/node-exporter-serviceAccount.yaml +- ./manifests/node-exporter-serviceMonitor.yaml +- ./manifests/prometheus-adapter-apiService.yaml +- ./manifests/prometheus-adapter-clusterRole.yaml +- 
./manifests/prometheus-adapter-clusterRoleBinding.yaml +- ./manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml +- ./manifests/prometheus-adapter-clusterRoleServerResources.yaml +- ./manifests/prometheus-adapter-configMap.yaml +- ./manifests/prometheus-adapter-deployment.yaml +- ./manifests/prometheus-adapter-roleBindingAuthReader.yaml +- ./manifests/prometheus-adapter-service.yaml +- ./manifests/prometheus-adapter-serviceAccount.yaml +- ./manifests/prometheus-clusterRole.yaml +- ./manifests/prometheus-clusterRoleBinding.yaml +- ./manifests/prometheus-prometheus.yaml +- ./manifests/prometheus-roleBindingConfig.yaml +- ./manifests/prometheus-roleBindingSpecificNamespaces.yaml +- ./manifests/prometheus-roleConfig.yaml +- ./manifests/prometheus-roleSpecificNamespaces.yaml +- ./manifests/prometheus-rules.yaml +- ./manifests/prometheus-service.yaml +- ./manifests/prometheus-serviceAccount.yaml +- ./manifests/prometheus-serviceMonitor.yaml +- ./manifests/prometheus-serviceMonitorApiserver.yaml +- ./manifests/prometheus-serviceMonitorCoreDNS.yaml +- ./manifests/prometheus-serviceMonitorKubeControllerManager.yaml +- ./manifests/prometheus-serviceMonitorKubeScheduler.yaml +- ./manifests/prometheus-serviceMonitorKubelet.yaml diff --git a/manifests/kustomization.yaml b/manifests/kustomization.yaml deleted file mode 100644 index a74f797f..00000000 --- a/manifests/kustomization.yaml +++ /dev/null @@ -1,66 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: -- 00namespace-namespace.yaml -- 0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml -- 0prometheus-operator-0prometheusCustomResourceDefinition.yaml -- 0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml -- 0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml -- 0prometheus-operator-clusterRole.yaml -- 0prometheus-operator-clusterRoleBinding.yaml -- 0prometheus-operator-deployment.yaml -- 0prometheus-operator-service.yaml -- 
0prometheus-operator-serviceAccount.yaml -- 0prometheus-operator-serviceMonitor.yaml -- alertmanager-alertmanager.yaml -- alertmanager-secret.yaml -- alertmanager-service.yaml -- alertmanager-serviceAccount.yaml -- alertmanager-serviceMonitor.yaml -- grafana-dashboardDatasources.yaml -- grafana-dashboardDefinitions.yaml -- grafana-dashboardSources.yaml -- grafana-deployment.yaml -- grafana-service.yaml -- grafana-serviceAccount.yaml -- grafana-serviceMonitor.yaml -- kube-state-metrics-clusterRole.yaml -- kube-state-metrics-clusterRoleBinding.yaml -- kube-state-metrics-deployment.yaml -- kube-state-metrics-role.yaml -- kube-state-metrics-roleBinding.yaml -- kube-state-metrics-service.yaml -- kube-state-metrics-serviceAccount.yaml -- kube-state-metrics-serviceMonitor.yaml -- node-exporter-clusterRole.yaml -- node-exporter-clusterRoleBinding.yaml -- node-exporter-daemonset.yaml -- node-exporter-service.yaml -- node-exporter-serviceAccount.yaml -- node-exporter-serviceMonitor.yaml -- prometheus-adapter-apiService.yaml -- prometheus-adapter-clusterRole.yaml -- prometheus-adapter-clusterRoleBinding.yaml -- prometheus-adapter-clusterRoleBindingDelegator.yaml -- prometheus-adapter-clusterRoleServerResources.yaml -- prometheus-adapter-configMap.yaml -- prometheus-adapter-deployment.yaml -- prometheus-adapter-roleBindingAuthReader.yaml -- prometheus-adapter-service.yaml -- prometheus-adapter-serviceAccount.yaml -- prometheus-clusterRole.yaml -- prometheus-clusterRoleBinding.yaml -- prometheus-prometheus.yaml -- prometheus-roleBindingConfig.yaml -- prometheus-roleBindingSpecificNamespaces.yaml -- prometheus-roleConfig.yaml -- prometheus-roleSpecificNamespaces.yaml -- prometheus-rules.yaml -- prometheus-service.yaml -- prometheus-serviceAccount.yaml -- prometheus-serviceMonitor.yaml -- prometheus-serviceMonitorApiserver.yaml -- prometheus-serviceMonitorCoreDNS.yaml -- prometheus-serviceMonitorKubeControllerManager.yaml -- prometheus-serviceMonitorKubeScheduler.yaml -- 
prometheus-serviceMonitorKubelet.yaml From 6be1adc19702cef17c4bfaea18440e13b55ed8fd Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Wed, 20 Mar 2019 10:03:00 -0700 Subject: [PATCH 622/638] regenerating README.md --- README.md | 43 ++++++++----------------------------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 64cc805a..bc320b79 100644 --- a/README.md +++ b/README.md @@ -157,41 +157,14 @@ local kp = }, }; -local manifests = - { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + - { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + - { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + - { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + - { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + - { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + - { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + - { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; - -// local kustomization = { -// apiVersion: 'kustomize.config.k8s.io/v1beta1', -// kind: 'Kustomization', -// resources: -// ['00namespace-' + name + '.yaml' for name in std.objectFields(kp.kubePrometheus)] + -// ['0prometheus-operator-' + name + '.yaml' for name in std.objectFields(kp.prometheusOperator)] + -// ['node-exporter-' + name + '.yaml' for name in std.objectFields(kp.nodeExporter)] + -// ['kube-state-metrics-' + name + '.yaml' for name in std.objectFields(kp.kubeStateMetrics)] + -// ['alertmanager-' + name + '.yaml' for name in std.objectFields(kp.alertmanager)] + -// ['prometheus-' + name + '.yaml' for name in std.objectFields(kp.prometheus)] + 
-// ['prometheus-adapter-' + name + '.yaml' for name in std.objectFields(kp.prometheusAdapter)] + -// ['grafana-' + name + '.yaml' for name in std.objectFields(kp.grafana)], -// }; - -local foo = function(name) { name ::+ '.yaml' }; - -local kustomization = { - apiVersion: 'kustomize.config.k8s.io/v1beta1', - kind: 'Kustomization', - resources: std.map(foo , std.objectFields(manifests)) -}; - -manifests { - kustomization: kustomization -} +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`): From 6c8d948335bd9a4c029bc2d8652fecd772157cf7 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Thu, 21 Mar 2019 10:47:40 -0700 Subject: [PATCH 623/638] all the new lines --- example.jsonnet | 3 ++- examples/kustomize.jsonnet | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/example.jsonnet b/example.jsonnet index 81c473c6..c446eac1 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -12,4 +12,5 @@ local kp = { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: 
kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } \ No newline at end of file +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index d953f81e..e1cf7a27 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -24,4 +24,5 @@ local kustomization = { manifests { "../kustomization": kustomization, -} \ No newline at end of file +} + From 8a9f2c71fddc8a502adf8baeed9c51c4585ff5f6 Mon Sep 17 00:00:00 2001 From: Karl Skewes Date: Tue, 26 Mar 2019 22:49:51 +1300 Subject: [PATCH 624/638] contrib/kube-prometheus: Jsonnet snippet for managed kubernetes clusters - rename Follow other snippet file extension names This file is not referenced from other location in repo according to quick grep. 
--- ...-cluster.jsonnet => kube-prometheus-managed-cluster.libsonnet} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename jsonnet/kube-prometheus/{kube-prometheus-managed-cluster.jsonnet => kube-prometheus-managed-cluster.libsonnet} (100%) diff --git a/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.jsonnet b/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/kube-prometheus-managed-cluster.jsonnet rename to jsonnet/kube-prometheus/kube-prometheus-managed-cluster.libsonnet From 10bd451f89c101747a7fbe4b4845a14a0b3455de Mon Sep 17 00:00:00 2001 From: goll Date: Tue, 26 Mar 2019 19:30:39 +0100 Subject: [PATCH 625/638] Update travis to latest minikube, k8s, jsonnet --- Makefile | 4 +++- tests/e2e/travis-e2e.sh | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 6b3651ae..ade9ba2c 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,7 @@ generate-in-docker: ../../hack/jsonnet-docker-image --rm \ -u=$(shell id -u $(USER)):$(shell id -g $(USER)) \ -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ \ + -v $(shell go env GOCACHE):/.cache/go-build \ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ po-jsonnet make generate @@ -43,7 +44,7 @@ test: $(JB_BINARY) ./test.sh test-e2e: - go test -timeout 55m -v ./tests/e2e + go test -timeout 55m -v ./tests/e2e -count=1 test-in-docker: ../../hack/jsonnet-docker-image @echo ">> Compiling assets and generating Kubernetes manifests" @@ -51,6 +52,7 @@ test-in-docker: ../../hack/jsonnet-docker-image --rm \ -u=$(shell id -u $(USER)):$(shell id -g $(USER)) \ -v $(shell dirname $(dir $(abspath $(dir $$PWD)))):/go/src/github.com/coreos/prometheus-operator/ \ + -v $(shell go env GOCACHE):/.cache/go-build \ --workdir /go/src/github.com/coreos/prometheus-operator/contrib/kube-prometheus \ po-jsonnet make test diff --git 
a/tests/e2e/travis-e2e.sh b/tests/e2e/travis-e2e.sh index 45fb974a..d1149697 100755 --- a/tests/e2e/travis-e2e.sh +++ b/tests/e2e/travis-e2e.sh @@ -12,9 +12,6 @@ SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") "${SCRIPT_DIR}"/../../../../scripts/create-minikube.sh -# waiting for kube-dns to be ready -JSONPATH='{range .items[*]}{@.metadata.name}:{range @.status.conditions[*]}{@.type}={@.status};{end}{end}'; until kubectl -n kube-system get pods -lk8s-app=kube-dns -o jsonpath="$JSONPATH" 2>&1 | grep -q "Ready=True"; do sleep 1;echo "waiting for kube-dns to be available"; kubectl get pods --all-namespaces; done - ( cd "${SCRIPT_DIR}"/../.. || exit kubectl apply -f manifests From 8f34715271768d26e23beb0946b366c096ac8fb0 Mon Sep 17 00:00:00 2001 From: goll Date: Tue, 26 Mar 2019 19:32:00 +0100 Subject: [PATCH 626/638] kube-prometheus: Re-generate --- manifests/grafana-dashboardDefinitions.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index cdb8ff3f..e140f4e5 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{cluster=\"$cluster\"}\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"} - node_filesystem_avail_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)) by (pod,namespace)\n/ 
scalar(sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{cluster=\"$cluster\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -6100,7 +6100,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6191,7 +6191,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -7006,21 +7006,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + 
"expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -7374,7 +7374,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\u007e\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=~\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -7541,7 +7541,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7624,7 +7624,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7707,7 +7707,7 @@ items: 
"tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod_name=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", From 37a0e2b5a6cd5c6d3c7ff7f69bcb84a25cad8e7f Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Tue, 26 Mar 2019 11:44:37 -0700 Subject: [PATCH 627/638] running gen in docker --- README.md | 1 + docs/developing-prometheus-rules-and-grafana-dashboards.md | 1 + examples/kustomize.jsonnet | 6 +++--- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bc320b79..115fbbbf 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,7 @@ local kp = { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + ``` And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`): diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index 671a8987..17838b82 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -25,6 +25,7 @@ local kp = { ['prometheus-' + name]: kp.prometheus[name] for name 
in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + ``` ## Prometheus rules diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index e1cf7a27..1b6b9038 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -15,14 +15,14 @@ local manifests = { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; -local kustomizationResourceFile(name) = "./manifests/" + name + ".yaml"; +local kustomizationResourceFile(name) = './manifests/' + name + '.yaml'; local kustomization = { apiVersion: 'kustomize.config.k8s.io/v1beta1', kind: 'Kustomization', - resources: std.map(kustomizationResourceFile, std.objectFields(manifests)) + resources: std.map(kustomizationResourceFile, std.objectFields(manifests)), }; manifests { - "../kustomization": kustomization, + '../kustomization': kustomization, } From 1472a902794e1e15304d5446fdc75b87741c9cd3 Mon Sep 17 00:00:00 2001 From: Nick Date: Tue, 2 Apr 2019 08:11:36 +0100 Subject: [PATCH 628/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 2 +- manifests/grafana-dashboardDefinitions.yaml | 26 ++++++++++++++++----- manifests/prometheus-rules.yaml | 20 ++++++++-------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index ad38b5c9..9b315a0b 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "b8b1a40066bd40bf7612bbb1cc9208f76530f44a" + "version": "0669b548b8bc981f2676e7ec70c8f4a05fa39aa7" }, { "name": "grafonnet", diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e140f4e5..375b0ed3 100644 --- 
a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -6855,7 +6855,7 @@ items: "options": [ ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\"}, exported_namespace)", + "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\"}, namespace)", "refresh": 2, "regex": "", "sort": 0, @@ -6881,7 +6881,7 @@ items: "options": [ ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", exported_namespace=\"$namespace\"}, persistentvolumeclaim)", + "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\"}, persistentvolumeclaim)", "refresh": 2, "regex": "", "sort": 0, @@ -7013,14 +7013,14 @@ items: "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\", pod=\"$pod\", container=~\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -7124,11 +7124,25 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by (container_name) 
(rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"}[1m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ container_name }}", + "legendFormat": "Current: {{ container_name }}", "refId": "A" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\", pod=\"$pod\", container=~\"$container\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "refId": "B" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\", pod=\"$pod\", container=~\"$container\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "refId": "C" } ], "thresholds": [ diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 7f04e057..2bf44950 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -37,14 +37,14 @@ spec: record: namespace_name:container_memory_usage_bytes:sum - expr: | sum by (namespace, label_name) ( - sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod) + sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod) * on (namespace, pod) group_left(label_name) label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", 
"pod", "(.*)") ) record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum - expr: | sum by (namespace, label_name) ( - sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) + sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod) * on (namespace, pod) group_left(label_name) label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") ) @@ -235,11 +235,11 @@ spec: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -603,7 +603,7 @@ spec: message: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) / sum(node:node_num_cpu:sum) > 1.5 @@ -615,7 +615,7 @@ spec: message: Cluster has overcommitted memory resource requests for Namespaces. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"}) > 1.5 @@ -813,19 +813,19 @@ spec: - alert: KubeClientCertificateExpiration annotations: message: A client certificate used to authenticate to the apiserver is expiring - in less than 7 days. + in less than 7.0 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | - histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: severity: warning - alert: KubeClientCertificateExpiration annotations: message: A client certificate used to authenticate to the apiserver is expiring - in less than 24 hours. + in less than 24.0 hours. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | - histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical - name: alertmanager.rules From cbf7a1b2b843650b822a1b8bdeead445c7aee8d6 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 2 Apr 2019 09:43:22 +0200 Subject: [PATCH 629/638] kube-prometheus: Document disabling metrics-server on minikube --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 115fbbbf..d173a2b3 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,12 @@ In order to just try out this stack, start minikube with the following command: $ minikube delete && minikube start --kubernetes-version=v1.13.2 --memory=4096 --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 ``` +> The kube-prometheus stack includes a resource metrics API server, like the metrics-server does. So ensure the metrics-server plugin is disabled on minikube: +> +> ``` +> minikube addons disable metrics-server +> ``` + ## Quickstart This project is intended to be used as a library (i.e. the intent is not for you to create your own modified copy of this repository). 
From df17b1ca3836da4767aab0bb54b412d1df7e85f9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 2 Apr 2019 11:43:01 +0200 Subject: [PATCH 630/638] kube-prometheus: Fix double accounting of pod usage cAdvisor exposes metrics for each cgroup hierarchy step, and containers are part of the respective pod's hierarchy, causing double accounting when not filtered appropriately. --- .../prometheus-adapter/prometheus-adapter.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet index 193a36a5..6d6604bc 100644 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet @@ -19,7 +19,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; config: ||| resourceRules: cpu: - containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) + containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}[1m])) by (<<.GroupBy>>) nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) resources: overrides: @@ -31,7 +31,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; resource: pod containerLabel: container_name memory: - containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) + containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}) by (<<.GroupBy>>) nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>) resources: overrides: From 87f8b8c9312b883f2dee5d918d6952a9e3bdd408 Mon
Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 2 Apr 2019 15:03:23 +0200 Subject: [PATCH 631/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 10 +++++----- manifests/grafana-deployment.yaml | 2 +- manifests/prometheus-adapter-configMap.yaml | 4 ++-- manifests/prometheus-prometheus.yaml | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 9b315a0b..9024b9da 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "9faab58c2b1cce4def2cc35045162554b8e4a706" + "version": "3623fd0dfc7be15ab2fbe648217f238f614b2d1f" }, { "name": "ksonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "11022f5e920ac1ea960556193e3f0ab57d70d7c5" + "version": "d270f529db9eb750425a173188c534ab92532f47" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "5cc4bfab6e2453266e47d01b78cbae0b2643426e" + "version": "2c635c3310c6e61720871ac94d6d2572e37b83f7" }, { "name": "grafana", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "9ddf5a198b0f7c898dc061158ea427112acbae11" + "version": "de2ec3f0f9115da2d47dc6b86af9b402e2bf146d" }, { "name": "prometheus-operator", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "e1ca3b4434945e57e8e3a451cdbde74a903cc8e1" + "version": "a621d807f061e1dd635033a8d6bc261461429e27" } ] } diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 2caaca6d..bd6bf678 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: grafana/grafana:6.0.0-beta1 + - image: grafana/grafana:6.0.1 name: grafana ports: - containerPort: 3000 diff --git a/manifests/prometheus-adapter-configMap.yaml b/manifests/prometheus-adapter-configMap.yaml index d6ebd78f..15434fc5 100644 --- 
a/manifests/prometheus-adapter-configMap.yaml +++ b/manifests/prometheus-adapter-configMap.yaml @@ -3,7 +3,7 @@ data: config.yaml: | resourceRules: cpu: - containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) + containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}[1m])) by (<<.GroupBy>>) nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) resources: overrides: @@ -15,7 +15,7 @@ data: resource: pod containerLabel: container_name memory: - containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) + containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}) by (<<.GroupBy>>) nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>) resources: overrides: diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index c16914b0..c5952cc7 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -29,4 +29,4 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - version: v2.5.0 + version: v2.7.2 From 533fb1740a0c04bc240d01011c1c78f7f3f5d35c Mon Sep 17 00:00:00 2001 From: Artur Pedroso Date: Thu, 4 Apr 2019 16:03:34 +0100 Subject: [PATCH 632/638] Update README.md Fix go get jsonnet command. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d173a2b3..5b978f71 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} ``` -> Note you need `jsonnet` (`go get github.com/google/go-jsonnet/jsonnet`) and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed to run `build.sh`. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. +> Note you need `jsonnet` (`go get github.com/google/go-jsonnet/cmd/jsonnet`) and `gojsontoyaml` (`go get github.com/brancz/gojsontoyaml`) installed to run `build.sh`. If you just want json output, not yaml, then you can skip the pipe and everything afterwards. This script runs the jsonnet code, then reads each key of the generated json and uses that as the file name, and writes the value of that key to that file, and converts each json manifest to yaml. From 438381c3e3318bb96a96d38d99f63a53d288397c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 8 Apr 2019 15:56:39 +0200 Subject: [PATCH 633/638] kube-prometheus: Set Alertmanager sessionAffinity to ClientIP --- jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 47e61e29..c46a87dc 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -67,6 +67,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local alertmanagerPort = servicePort.newNamed('web', 9093, 'web'); service.new('alertmanager-' + $._config.alertmanager.name, { app: 'alertmanager', alertmanager: $._config.alertmanager.name }, alertmanagerPort) + + service.mixin.spec.withSessionAffinity('ClientIP') +
service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ alertmanager: $._config.alertmanager.name }), From 7e4381ca87812e48b6d96a05da4150bf93b28af9 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 8 Apr 2019 16:13:31 +0200 Subject: [PATCH 634/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 4 +- manifests/alertmanager-service.yaml | 1 + manifests/grafana-dashboardDefinitions.yaml | 2070 ++++++++++++++++++- manifests/grafana-deployment.yaml | 12 + manifests/prometheus-rules.yaml | 33 + 5 files changed, 2084 insertions(+), 36 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 9024b9da..ac2cb657 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "3623fd0dfc7be15ab2fbe648217f238f614b2d1f" + "version": "3bd8b755d11bcf9be1e70b5b7ffe0ad881f300fe" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "0669b548b8bc981f2676e7ec70c8f4a05fa39aa7" + "version": "3991660410dab201bb1f60b84d26e027c6c877e4" }, { "name": "grafonnet", diff --git a/manifests/alertmanager-service.yaml b/manifests/alertmanager-service.yaml index 0c756793..df4c9ff5 100644 --- a/manifests/alertmanager-service.yaml +++ b/manifests/alertmanager-service.yaml @@ -13,3 +13,4 @@ spec: selector: alertmanager: main app: alertmanager + sessionAffinity: ClientIP diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 375b0ed3..d07d6a06 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2592,6 +2592,42 @@ items: "pattern": "Time", "type": "hidden" }, + { + "alias": "Pods", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "link": true, + "linkTooltip": "Drill down to pods", + "linkUrl": 
"/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Workloads", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "link": true, + "linkTooltip": "Drill down to workloads", + "linkUrl": "/d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, { "alias": "CPU Usage", "colorMode": null, @@ -2603,7 +2639,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value #C", "thresholds": [ ], @@ -2621,7 +2657,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #B", + "pattern": "Value #D", "thresholds": [ ], @@ -2639,7 +2675,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #C", + "pattern": "Value #E", "thresholds": [ ], @@ -2657,7 +2693,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #D", + "pattern": "Value #F", "thresholds": [ ], @@ -2675,7 +2711,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #E", + "pattern": "Value #G", "thresholds": [ ], @@ -2691,7 +2727,7 @@ items: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, - "linkTooltip": "Drill down", + "linkTooltip": "Drill down to pods", "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -2718,7 +2754,7 @@ items: ], "targets": [ { - "expr": 
"sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "count(mixin_pod_workload{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2727,7 +2763,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "count(avg(mixin_pod_workload{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2736,7 +2772,7 @@ items: "step": 10 }, { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2745,7 +2781,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2754,13 +2790,31 @@ items: "step": 10 }, { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "", "refId": "E", "step": 10 + }, + { + "expr": 
"sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + }, + { + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "G", + "step": 10 } ], "thresholds": [ @@ -2958,43 +3012,43 @@ items: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "Pods", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", + "decimals": 0, + "link": true, + "linkTooltip": "Drill down to pods", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #A", "thresholds": [ ], "type": "number", - "unit": "bytes" + "unit": "short" }, { - "alias": "Memory Requests", + "alias": "Workloads", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", + "decimals": 0, + "link": true, + "linkTooltip": "Drill down to workloads", + "linkUrl": "/d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #B", "thresholds": [ ], "type": "number", - "unit": "bytes" + "unit": "short" }, { - "alias": "Memory Requests %", + "alias": "CPU Usage", "colorMode": null, "colors": [ @@ -3009,10 +3063,10 @@ items: ], "type": "number", - "unit": "percentunit" + "unit": "short" }, { - "alias": "Memory Limits", + "alias": "Memory Usage", "colorMode": null, "colors": [ @@ -3030,7 +3084,7 @@ items: "unit": 
"bytes" }, { - "alias": "Memory Limits %", + "alias": "Memory Requests", "colorMode": null, "colors": [ @@ -3043,6 +3097,60 @@ items: "pattern": "Value #E", "thresholds": [ + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #G", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #H", + "thresholds": [ + ], "type": "number", "unit": "percentunit" @@ -3056,7 +3164,7 @@ items: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, - "linkTooltip": "Drill down", + "linkTooltip": "Drill down to pods", "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -3083,7 +3191,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace)", + "expr": "count(mixin_pod_workload{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3092,7 +3200,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "count(avg(mixin_pod_workload{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)", "format": "table", 
"instant": true, "intervalFactor": 2, @@ -3101,7 +3209,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3110,7 +3218,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3119,13 +3227,31 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "", "refId": "E", "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + }, + { + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "G", + "step": 10 } ], "thresholds": [ @@ -4207,7 +4333,7 @@ items: ] }, "timezone": "", - "title": "Kubernetes / Compute Resources / Namespace", + "title": "Kubernetes / Compute Resources / Namespace 
(Pods)", "uid": "85a562078cdf77779eaa1add43ccec1e", "version": 0 } @@ -5205,6 +5331,1882 @@ items: metadata: name: grafana-dashboard-k8s-resources-pod namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-workload.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": 
false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + 
"refId": "B", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + 
"intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n ) by (pod)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory 
Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + 
"dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n ) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(\n 
kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n ) by (pod)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n ) by (pod)\n/sum(\n 
kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Quota", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", 
+ "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "workload", + "multi": false, + "name": "workload", + "options": [ + + ], + "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "type", + "multi": false, + "name": "type", + "options": [ + + ], + "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kubernetes / Compute Resources / Workload", + "uid": "a164a7f0339f99e89cea5cb47e9be617", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-resources-workload + namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-workloads-namespace.json: |- + { + "annotations": { + "list": [ + + ] + }, + 
"editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{workload}} - {{workload_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 
"250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Running Pods", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": 
false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Workload", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", + "pattern": "workload", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Workload Type", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "workload_type", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "count(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}) by (workload, workload_type)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by 
(workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", 
namespace=\"$namespace\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) 
group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n ) by (workload, workload_type)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{workload}} - {{workload_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Running Pods", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + 
{ + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Workload", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", + "pattern": "workload", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Workload Type", + "colorMode": null, + "colors": [ + + ], + "dateFormat": 
"YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "workload_type", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "count(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}) by (workload, workload_type)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n ) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n ) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * 
on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n ) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + 
"repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Quota", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kubernetes / Compute Resources / Namespace (Workloads)", + "uid": "a87fb0d919ec0ea5f6543124e16c42a5", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-resources-workloads-namespace + namespace: monitoring - apiVersion: v1 data: nodes.json: |- diff --git a/manifests/grafana-deployment.yaml 
b/manifests/grafana-deployment.yaml index bd6bf678..18133259 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -57,6 +57,12 @@ spec: - mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod name: grafana-dashboard-k8s-resources-pod readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workload + name: grafana-dashboard-k8s-resources-workload + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workloads-namespace + name: grafana-dashboard-k8s-resources-workloads-namespace + readOnly: false - mountPath: /grafana-dashboard-definitions/0/nodes name: grafana-dashboard-nodes readOnly: false @@ -99,6 +105,12 @@ spec: - configMap: name: grafana-dashboard-k8s-resources-pod name: grafana-dashboard-k8s-resources-pod + - configMap: + name: grafana-dashboard-k8s-resources-workload + name: grafana-dashboard-k8s-resources-workload + - configMap: + name: grafana-dashboard-k8s-resources-workloads-namespace + name: grafana-dashboard-k8s-resources-workloads-namespace - configMap: name: grafana-dashboard-nodes name: grafana-dashboard-nodes diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 2bf44950..e4607322 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -49,6 +49,39 @@ spec: label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") ) record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + sum( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) by (namespace, workload, pod) + labels: + workload_type: deployment + record: mixin_pod_workload + - expr: | + sum( + label_replace( + 
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) by (namespace, workload, pod) + labels: + workload_type: daemonset + record: mixin_pod_workload + - expr: | + sum( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) by (namespace, workload, pod) + labels: + workload_type: statefulset + record: mixin_pod_workload - name: kube-scheduler.rules rules: - expr: | From f8bd9b89f7922b0341c71c0e42d58c5fd1b5086b Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 9 Apr 2019 15:28:44 +0200 Subject: [PATCH 635/638] kube-prometheus: Don't drop non-pod metrics These metrics can still be valuable to troubleshoot components running in cgroups on the host, outside the scope of Kubernetes pods. --- jsonnet/kube-prometheus/prometheus/prometheus.libsonnet | 7 ------- 1 file changed, 7 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index ba2adb05..57c54109 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -283,13 +283,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; }, bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', metricRelabelings: [ - // Drop container_* metrics with no image. - { - sourceLabels: ['__name__', 'image'], - regex: 'container_([a-z_]+);', - action: 'drop', - }, - // Drop a bunch of metrics which are disabled but still sent, see // https://github.com/google/cadvisor/issues/1925. 
{ From d63f0c6591df0f2ac75b788d32836f9d98cf7ae0 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 9 Apr 2019 15:35:54 +0200 Subject: [PATCH 636/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 2 +- manifests/prometheus-serviceMonitorKubelet.yaml | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index ac2cb657..6b64adb0 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "3bd8b755d11bcf9be1e70b5b7ffe0ad881f300fe" + "version": "3ba7822228654f3bc864a7c37139665c7549739a" }, { "name": "ksonnet", diff --git a/manifests/prometheus-serviceMonitorKubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml index 590a5cd4..91da377a 100644 --- a/manifests/prometheus-serviceMonitorKubelet.yaml +++ b/manifests/prometheus-serviceMonitorKubelet.yaml @@ -18,11 +18,6 @@ spec: honorLabels: true interval: 30s metricRelabelings: - - action: drop - regex: container_([a-z_]+); - sourceLabels: - - __name__ - - image - action: drop regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) sourceLabels: From 30d2ef0f25be54dd0a2f56efcf03a27d7d4e83ef Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 10 Apr 2019 16:58:37 +0200 Subject: [PATCH 637/638] kube-prometheus: Add clock skew and node network interface alerts --- jsonnet/kube-prometheus/alerts/node.libsonnet | 75 +++++++++++++++++++ .../node-exporter/node-exporter.libsonnet | 1 + 2 files changed, 76 insertions(+) diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet index 37fff428..3dca1b0a 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -32,6 +32,81 @@ }, ], }, + { + name: 'node-time', + rules: [ + { + alert: 'ClockSkewDetected', + annotations: { + message: 'Clock skew 
detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host.', + }, + expr: ||| + node_ntp_offset_seconds{%(nodeExporterSelector)s} < -0.03 or node_ntp_offset_seconds{%(nodeExporterSelector)s} > 0.03 + ||| % $._config, + 'for': '2m', + labels: { + severity: 'warning', + }, + }, + ], + }, + { + name: 'node-network', + rules: [ + { + alert: 'NetworkReceiveErrors', + annotations: { + message: 'Network interface "{{ $labels.device }}" showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"', + }, + expr: ||| + rate(node_network_receive_errs_total{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 0 + ||| % $._config, + 'for': '2m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'NetworkTransmitErrors', + annotations: { + message: 'Network interface "{{ $labels.device }}" showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"', + }, + expr: ||| + rate(node_network_transmit_errs_total{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 0 + ||| % $._config, + 'for': '2m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeNetworkInterfaceDown', + annotations: { + message: 'Network interface "{{ $labels.device }}" down on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"', + }, + expr: ||| + node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s} == 0 + ||| % $._config, + 'for': '2m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeNetworkInterfaceFlapping', + annotations: { + message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"', + }, + expr: ||| + changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2 + ||| % $._config, + 'for': '2m', + labels: { + severity: 'warning', + }, + }, + ], + }, ], }, } diff --git 
a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index 6b7f7f8a..8aae5b86 100644 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -101,6 +101,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; // Once node exporter is being released with those settings, this can be removed. '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)', '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$', + '--collector.ntp', ]) + container.withVolumeMounts([procVolumeMount, sysVolumeMount, rootVolumeMount]) + container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + From 788c82860a5cce060cc04eb67f5e5f5e1d7d6789 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 10 Apr 2019 17:43:24 +0200 Subject: [PATCH 638/638] kube-prometheus: Re-generate --- jsonnetfile.lock.json | 2 +- manifests/node-exporter-daemonset.yaml | 1 + manifests/prometheus-rules.yaml | 49 ++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 6b64adb0..ed5c26cc 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "3ba7822228654f3bc864a7c37139665c7549739a" + "version": "82817c8f9277c82ca164a6ef75bf476e56f24521" }, { "name": "ksonnet", diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index f0729b5d..56e4b90b 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -22,6 +22,7 @@ spec: - --path.rootfs=/host/root - 
--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ + - --collector.ntp image: quay.io/prometheus/node-exporter:v0.17.0 name: node-exporter resources: diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index e4607322..ee1f21ba 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -931,6 +931,55 @@ spec: for: 10m labels: severity: critical + - name: node-time + rules: + - alert: ClockSkewDetected + annotations: + message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod + }}. Ensure NTP is configured correctly on this host. + expr: | + node_ntp_offset_seconds{job="node-exporter"} < -0.03 or node_ntp_offset_seconds{job="node-exporter"} > 0.03 + for: 2m + labels: + severity: warning + - name: node-network + rules: + - alert: NetworkReceiveErrors + annotations: + message: Network interface "{{ $labels.device }}" showing receive errors on + node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + expr: | + rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0 + for: 2m + labels: + severity: warning + - alert: NetworkTransmitErrors + annotations: + message: Network interface "{{ $labels.device }}" showing transmit errors + on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + expr: | + rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0 + for: 2m + labels: + severity: warning + - alert: NodeNetworkInterfaceDown + annotations: + message: Network interface "{{ $labels.device }}" down on node-exporter {{ + $labels.namespace }}/{{ $labels.pod }}" + expr: | + node_network_up{job="node-exporter",device!~"veth.+"} == 0 + for: 2m + labels: + severity: warning + - alert: NodeNetworkInterfaceFlapping + 
annotations: + message: Network interface "{{ $labels.device }}" changing it's up status + often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning - name: prometheus.rules rules: - alert: PrometheusConfigReloadFailed