From f9b03ddd9d92143645a248ae34956efe0db38f31 Mon Sep 17 00:00:00 2001 From: Andrey Klimentyev Date: Tue, 10 Apr 2018 14:54:48 +0300 Subject: [PATCH] kube-prometheus: fixed CPU accounting Currently, node recording rules feature an incorrect idle CPU accounting. This change aims to fix that. --- assets/prometheus/rules/node.rules.yaml | 6 +++--- manifests/prometheus/prometheus-k8s-rules.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml index 9a9d599c..e678ca84 100644 --- a/assets/prometheus/rules/node.rules.yaml +++ b/assets/prometheus/rules/node.rules.yaml @@ -2,7 +2,7 @@ groups: - name: node.rules rules: - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_filesystem_usage:sum expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) @@ -12,10 +12,10 @@ groups: - record: instance:node_network_transmit_bytes:rate:sum expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle"}[5m])) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:ratio expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - alert: NodeExporterDown diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 403e4383..552aad5e 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -498,7 +498,7 @@ data: - name: node.rules rules: - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_filesystem_usage:sum expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) @@ -508,10 +508,10 @@ data: - record: instance:node_network_transmit_bytes:rate:sum expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle"}[5m])) + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:ratio expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - alert: NodeExporterDown