diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json
index 6fabeb58..eacda694 100644
--- a/jsonnet/kube-prometheus/jsonnetfile.json
+++ b/jsonnet/kube-prometheus/jsonnetfile.json
@@ -18,7 +18,7 @@
                     "subdir": ""
                 }
             },
-            "version": "master"
+            "version": "release-0.3"
         },
         {
             "name": "grafana",
diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json
index f89760c7..25ad9e1b 100644
--- a/jsonnetfile.lock.json
+++ b/jsonnetfile.lock.json
@@ -72,8 +72,8 @@
                     "subdir": ""
                 }
             },
-            "version": "68f82d2a428d91df57e9af43739981a6a8ede897",
-            "sum": "J/tuXi0Z8GRHo63pM17YFIyk4QgkFuMcQ20mAxi1flM="
+            "version": "7bf9a2a321356a7625509fe458132c26b2e33b29",
+            "sum": "4c1hfhiHdWmR80Wzw8ntr9Ef7z2JZHYNUU25+RKi8yw="
         },
         {
             "name": "node-mixin",
diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml
index 66474aa9..c20ec4f1 100644
--- a/manifests/grafana-dashboardDefinitions.yaml
+++ b/manifests/grafana-dashboardDefinitions.yaml
@@ -22944,7 +22944,7 @@ items:
                           "steppedLine": false,
                           "targets": [
                               {
-                                  "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                                  "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                                   "format": "time_series",
                                   "interval": "1m",
                                   "intervalFactor": 2,
@@ -22952,7 +22952,7 @@ items:
                                   "refId": "A"
                               },
                               {
-                                  "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                                  "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                                   "format": "time_series",
                                   "interval": "1m",
                                   "intervalFactor": 2,
@@ -22960,7 +22960,7 @@ items:
                                   "refId": "B"
                               },
                               {
-                                  "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                                  "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                                   "format": "time_series",
                                   "interval": "1m",
                                   "intervalFactor": 2,
diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml
index 422893cb..c911ac7b 100644
--- a/manifests/prometheus-rules.yaml
+++ b/manifests/prometheus-rules.yaml
@@ -40,10 +40,10 @@ spec:
         rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
       record: instance:node_vmstat_pgmajfault:rate1m
     - expr: |
-        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
       record: instance_device:node_disk_io_time_seconds:rate1m
     - expr: |
-        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
       record: instance_device:node_disk_io_time_weighted_seconds:rate1m
     - expr: |
         sum without (device) (
@@ -95,23 +95,33 @@ spec:
     - expr: |
         sum by (cluster, namespace, pod, container) (
           rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
-        ) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info)
+        ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
+          1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
     - expr: |
         container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_working_set_bytes
     - expr: |
         container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_rss
     - expr: |
         container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_cache
     - expr: |
         container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_swap
     - expr: |
         sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
@@ -139,35 +149,39 @@ spec:
         )
       record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             label_replace(
               kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
               "replicaset", "$1", "owner_name", "(.*)"
-            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
+            ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
+              1, max by (replicaset, namespace, owner_name) (
+                kube_replicaset_owner{job="kube-state-metrics"}
+              )
+            ),
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: deployment
       record: mixin_pod_workload
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: daemonset
       record: mixin_pod_workload
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: statefulset
       record: mixin_pod_workload
@@ -221,10 +235,13 @@ spec:
   - name: node.rules
     rules:
     - expr: |
-        sum(min(kube_pod_info) by (cluster, node))
+        sum(min(kube_pod_info{node!=""}) by (cluster, node))
       record: ':kube_pod_info_node_count:'
     - expr: |
-        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
+        topk by(namespace, pod) (1,
+          max by (node, namespace, pod) (
+            label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
+        ))
       record: 'node_namespace_pod:kube_pod_info:'
     - expr: |
         count by (cluster, node) (sum by (node, cpu) (
@@ -445,7 +462,13 @@ spec:
           state for longer than 15 minutes.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
-        sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
+        sum by (namespace, pod) (
+          max by(namespace, pod) (
+            kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
+          ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
+            1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+          )
+        ) > 0
       for: 15m
       labels:
         severity: critical
@@ -1023,6 +1046,17 @@ spec:
           apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
       labels:
         severity: critical
+    - alert: AggregatedAPIErrors
+      annotations:
+        message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
+          reported errors. The number of errors have increased for it in the past
+          five minutes. High values indicate that the availability of the service
+          changes too often.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
+      expr: |
+        sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
+      labels:
+        severity: warning
     - alert: KubeAPIDown
       annotations:
         message: KubeAPI has disappeared from Prometheus target discovery.