From 4dd632afc00c919dd1c796093747ce5cd2bad6f5 Mon Sep 17 00:00:00 2001 From: Damien Grisonnet Date: Tue, 26 May 2020 18:21:38 +0200 Subject: [PATCH] jsonnet: pin kubernetes-mixin version Signed-off-by: Damien Grisonnet --- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- jsonnetfile.lock.json | 4 +- manifests/grafana-dashboardDefinitions.yaml | 10 ++--- manifests/prometheus-rules.yaml | 41 +++++++++++++++------ 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index fa7545af..2b1ea22e 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -18,7 +18,7 @@ "subdir": "" } }, - "version": "master" + "version": "release-0.2" }, { "name": "grafana", diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index d97e7f93..7e9b270d 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -72,8 +72,8 @@ "subdir": "" } }, - "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5", - "sum": "qfm0EpLrEZ1+fe93LFLa9tyOalK6JehpholxO2d0xXU=" + "version": "a132ade95740f9364e477ae8e730eabd650d14cb", + "sum": "+5+biGgOmWhNenvUxAtdejDgL3FvdDp6Dv84v3Gdg6A=" }, { "name": "node-mixin", diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 5cfa21f5..7d9a0af8 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -269,7 +269,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (verb, le))", + "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", verb!=\"WATCH\"}[5m])) by (verb, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}}", @@ -22649,7 +22649,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])", + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -22657,7 +22657,7 @@ items: "refId": "A" }, { - "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])", + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -22665,7 +22665,7 @@ items: "refId": "B" }, { - "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])", + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -24915,7 +24915,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by(container) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})", + "expr": "sum by(container) (container_memory_working_set_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container }}", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index f9609d61..84bb6426 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -40,10 +40,10 @@ spec: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) record: instance:node_vmstat_pgmajfault:rate1m - expr: | - rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) record: instance_device:node_disk_io_time_seconds:rate1m - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) record: instance_device:node_disk_io_time_weighted_seconds:rate1m - expr: | sum without (device) ( @@ -68,17 +68,22 @@ spec: - name: kube-apiserver.rules rules: - expr: | - histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod)) + sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) + record: cluster:apiserver_request_duration_seconds:mean5m + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: quantile: "0.99" record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - expr: | - histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod)) + histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: quantile: "0.9" record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - expr: | - histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod)) + histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: quantile: "0.5" record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile @@ -431,7 +436,7 @@ spec: state for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready expr: | - sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0 + sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0 for: 15m labels: severity: critical @@ -759,12 +764,26 @@ spec: rules: - alert: KubeAPILatencyHigh annotations: - message: The API server has a 99th percentile latency of {{ $value }} seconds - for {{ $labels.verb }} {{ $labels.resource }}. + message: The API server has an abnormal latency of {{ $value }} seconds for + {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | - cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1 - for: 10m + ( + cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} + > + on (verb) group_left() + ( + avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) + + + 2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) + ) + ) > on (verb) group_left() + 1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) + and on (verb,resource) + cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} + > + 1 + for: 5m labels: severity: warning - alert: KubeAPILatencyHigh @@ -773,7 +792,7 @@ spec: for {{ $labels.verb }} {{ $labels.resource }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | - cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4 + cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4 for: 10m labels: severity: critical