bump rules

This commit is contained in:
Dmitry Verkhoturov
2018-11-07 12:59:40 +03:00
parent 0372a60d0c
commit 5a0835fa26
2 changed files with 58 additions and 58 deletions

View File

@@ -796,7 +796,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n",
"expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{node}}",
@@ -1920,7 +1920,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "1 - avg(rate(node_cpu{mode=\"idle\"}[1m]))",
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@@ -2172,7 +2172,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "1 - sum(:node_memory_MemFreeCachedBuffers:sum) / sum(:node_memory_MemTotal:sum)",
"expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum) / sum(:node_memory_MemTotal_bytes:sum)",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@@ -2256,7 +2256,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal:sum)",
"expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@@ -2340,7 +2340,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal:sum)",
"expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@@ -5003,7 +5003,7 @@ items:
},
"yaxes": [
{
"format": "percentunit",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
@@ -5011,7 +5011,7 @@ items:
"show": true
},
{
"format": "percentunit",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
@@ -5064,7 +5064,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m])) * 100",
"expr": "avg by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m])) * 100",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{cpu}}",
@@ -5076,7 +5076,7 @@ items:
],
"timeFrom": null,
"timeShift": null,
"title": "System load",
"title": "Usage Per Core",
"tooltip": {
"shared": true,
"sort": 0,
@@ -5168,7 +5168,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "avg (sum by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n",
"expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n",
"format": "time_series",
"intervalFactor": 10,
"legendFormat": "{{ cpu }}",
@@ -5276,7 +5276,7 @@ items:
"tableColumn": "",
"targets": [
{
"expr": "avg(sum by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n",
"expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": ""
@@ -5352,28 +5352,28 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "max(\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n)\n",
"expr": "max(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory used",
"refId": "A"
},
{
"expr": "max(node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"})",
"expr": "max(node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory buffers",
"refId": "B"
},
{
"expr": "max(node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"})",
"expr": "max(node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory cached",
"refId": "C"
},
{
"expr": "max(node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"})",
"expr": "max(node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory free",
@@ -5481,7 +5481,7 @@ items:
"tableColumn": "",
"targets": [
{
"expr": "max(\n (\n (\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n",
"expr": "max(\n (\n (\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": ""
@@ -5564,21 +5564,21 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "max(rate(node_disk_bytes_read{job=\"node-exporter\", instance=\"$instance\"}[2m]))",
"expr": "max(rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "read",
"refId": "A"
},
{
"expr": "max(rate(node_disk_bytes_written{job=\"node-exporter\", instance=\"$instance\"}[2m]))",
"expr": "max(rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "written",
"refId": "B"
},
{
"expr": "max(rate(node_disk_io_time_ms{job=\"node-exporter\", instance=\"$instance\"}[2m]))",
"expr": "max(rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "io time",
@@ -5773,7 +5773,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "max(rate(node_network_receive_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))",
"expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}}",
@@ -5864,7 +5864,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "max(rate(node_network_transmit_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))",
"expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}}",
@@ -5958,7 +5958,7 @@ items:
"options": [
],
"query": "label_values(node_boot_time{job=\"node-exporter\"}, instance)",
"query": "label_values(node_boot_time_seconds{job=\"node-exporter\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 0,

View File

@@ -122,17 +122,17 @@ spec:
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
count by (node) (sum by (node, cpu) (
node_cpu{job="node-exporter"}
node_cpu_seconds_total{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
))
record: node:node_num_cpu:sum
- expr: |
1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m]))
1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
record: :node_cpu_utilisation:avg1m
- expr: |
1 - avg by (node) (
rate(node_cpu{job="node-exporter",mode="idle"}[1m])
rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
record: node:node_cpu_utilisation:avg1m
@@ -152,26 +152,26 @@ spec:
record: 'node:node_cpu_saturation_load1:'
- expr: |
1 -
sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
/
sum(node_memory_MemTotal{job="node-exporter"})
sum(node_memory_MemTotal_bytes{job="node-exporter"})
record: ':node_memory_utilisation:'
- expr: |
sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
record: :node_memory_MemFreeCachedBuffers:sum
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
record: :node_memory_MemFreeCachedBuffers_bytes:sum
- expr: |
sum(node_memory_MemTotal{job="node-exporter"})
record: :node_memory_MemTotal:sum
sum(node_memory_MemTotal_bytes{job="node-exporter"})
record: :node_memory_MemTotal_bytes:sum
- expr: |
sum by (node) (
(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_available:sum
- expr: |
sum by (node) (
node_memory_MemTotal{job="node-exporter"}
node_memory_MemTotal_bytes{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
@@ -190,13 +190,13 @@ spec:
- expr: |
1 -
sum by (node) (
(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
sum by (node) (
node_memory_MemTotal{job="node-exporter"}
node_memory_MemTotal_bytes{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
@@ -213,21 +213,21 @@ spec:
)
record: node:node_memory_swap_io_bytes:sum_rate
- expr: |
avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]))
record: :node_disk_utilisation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_utilisation:avg_irate
- expr: |
avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]))
record: :node_disk_saturation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
@@ -241,25 +241,25 @@ spec:
max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_avail:'
- expr: |
sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
record: :node_net_utilisation:sum_irate
- expr: |
sum by (node) (
(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_utilisation:sum_irate
- expr: |
sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
record: :node_net_saturation:sum_irate
- expr: |
sum by (node) (
(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
@@ -688,8 +688,8 @@ spec:
severity: warning
- alert: KubeCronJobRunning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking
more than 1h to complete.
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
than 1h to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
expr: |
time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
@@ -698,8 +698,8 @@ spec:
severity: warning
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
one hour to complete.
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
than one hour to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@@ -739,7 +739,7 @@ spec:
expr: |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
/
sum(node_memory_MemTotal)
sum(node_memory_MemTotal_bytes)
>
(count(node:node_num_cpu:sum)-1)
/
@@ -766,7 +766,7 @@ spec:
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
/
sum(node_memory_MemTotal{job="node-exporter"})
sum(node_memory_MemTotal_bytes{job="node-exporter"})
> 1.5
for: 5m
labels:
@@ -801,7 +801,7 @@ spec:
- alert: KubePersistentVolumeUsageCritical
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
}}% free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
expr: |
@@ -816,14 +816,14 @@ spec:
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ $value }} bytes are available.
days. Currently {{ printf "%0.2f" $value }}% is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
expr: |
(
kubelet_volume_stats_used_bytes{job="kubelet"}
100 * (
kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
) > 0.85
) < 15
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 5m