jsonnet: bump kubernetes mixin
This commit is contained in:
@@ -27,7 +27,7 @@
|
|||||||
"subdir": ""
|
"subdir": ""
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "0afc72e70df6048c6b65fd3e4968e53b0812b30c"
|
"version": "24ea0d6e33a415e07ec7b675d74dea3cf01fde73"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafonnet",
|
"name": "grafonnet",
|
||||||
@@ -98,6 +98,16 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "154d59dee72b894f7245d8d78c9344d1211d521f"
|
"version": "154d59dee72b894f7245d8d78c9344d1211d521f"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "promgrafonnet",
|
||||||
|
"source": {
|
||||||
|
"git": {
|
||||||
|
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
|
||||||
|
"subdir": "lib/promgrafonnet"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"version": "24ea0d6e33a415e07ec7b675d74dea3cf01fde73"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -48,12 +48,6 @@ spec:
|
|||||||
- mountPath: /grafana-dashboard-definitions/0/controller-manager
|
- mountPath: /grafana-dashboard-definitions/0/controller-manager
|
||||||
name: grafana-dashboard-controller-manager
|
name: grafana-dashboard-controller-manager
|
||||||
readOnly: false
|
readOnly: false
|
||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-cluster-rsrc-use
|
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
|
||||||
readOnly: false
|
|
||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-node-rsrc-use
|
|
||||||
name: grafana-dashboard-k8s-node-rsrc-use
|
|
||||||
readOnly: false
|
|
||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
|
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
readOnly: false
|
readOnly: false
|
||||||
@@ -123,12 +117,6 @@ spec:
|
|||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-controller-manager
|
name: grafana-dashboard-controller-manager
|
||||||
name: grafana-dashboard-controller-manager
|
name: grafana-dashboard-controller-manager
|
||||||
- configMap:
|
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
|
||||||
- configMap:
|
|
||||||
name: grafana-dashboard-k8s-node-rsrc-use
|
|
||||||
name: grafana-dashboard-k8s-node-rsrc-use
|
|
||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
|
@@ -82,13 +82,6 @@ spec:
|
|||||||
- expr: |
|
- expr: |
|
||||||
sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
|
sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
|
||||||
record: namespace:container_memory_usage_bytes:sum
|
record: namespace:container_memory_usage_bytes:sum
|
||||||
- expr: |
|
|
||||||
sum by (namespace, label_name) (
|
|
||||||
sum(container_memory_usage_bytes{job="kubelet",image!="", container!="POD"}) by (pod, namespace)
|
|
||||||
* on (namespace, pod)
|
|
||||||
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
|
|
||||||
)
|
|
||||||
record: namespace:container_memory_usage_bytes:sum
|
|
||||||
- expr: |
|
- expr: |
|
||||||
sum by (namespace, label_name) (
|
sum by (namespace, label_name) (
|
||||||
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
|
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
|
||||||
@@ -214,177 +207,9 @@ spec:
|
|||||||
node_namespace_pod:kube_pod_info:
|
node_namespace_pod:kube_pod_info:
|
||||||
))
|
))
|
||||||
record: node:node_num_cpu:sum
|
record: node:node_num_cpu:sum
|
||||||
- expr: |
|
|
||||||
1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
|
|
||||||
record: :node_cpu_utilisation:avg1m
|
|
||||||
- expr: |
|
|
||||||
1 - avg by (node) (
|
|
||||||
rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:)
|
|
||||||
record: node:node_cpu_utilisation:avg1m
|
|
||||||
- expr: |
|
|
||||||
node:node_cpu_utilisation:avg1m
|
|
||||||
*
|
|
||||||
node:node_num_cpu:sum
|
|
||||||
/
|
|
||||||
scalar(sum(node:node_num_cpu:sum))
|
|
||||||
record: node:cluster_cpu_utilisation:ratio
|
|
||||||
- expr: |
|
|
||||||
sum(node_load1{job="node-exporter"})
|
|
||||||
/
|
|
||||||
sum(node:node_num_cpu:sum)
|
|
||||||
record: ':node_cpu_saturation_load1:'
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
node_load1{job="node-exporter"}
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
/
|
|
||||||
node:node_num_cpu:sum
|
|
||||||
record: 'node:node_cpu_saturation_load1:'
|
|
||||||
- expr: |
|
|
||||||
1 -
|
|
||||||
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
|
||||||
/
|
|
||||||
sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
|
||||||
record: ':node_memory_utilisation:'
|
|
||||||
- expr: |
|
- expr: |
|
||||||
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
||||||
record: :node_memory_MemFreeCachedBuffers_bytes:sum
|
record: :node_memory_MemFreeCachedBuffers_bytes:sum
|
||||||
- expr: |
|
|
||||||
sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
|
||||||
record: :node_memory_MemTotal_bytes:sum
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_memory_bytes_available:sum
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_memory_bytes_total:sum
|
|
||||||
- expr: |
|
|
||||||
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
|
||||||
/
|
|
||||||
node:node_memory_bytes_total:sum
|
|
||||||
record: node:node_memory_utilisation:ratio
|
|
||||||
- expr: |
|
|
||||||
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
|
||||||
/
|
|
||||||
scalar(sum(node:node_memory_bytes_total:sum))
|
|
||||||
record: node:cluster_memory_utilisation:ratio
|
|
||||||
- expr: |
|
|
||||||
1e3 * sum(
|
|
||||||
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
|
|
||||||
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
|
|
||||||
)
|
|
||||||
record: :node_memory_swap_io_bytes:sum_rate
|
|
||||||
- expr: |
|
|
||||||
1 -
|
|
||||||
sum by (node) (
|
|
||||||
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
/
|
|
||||||
sum by (node) (
|
|
||||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: 'node:node_memory_utilisation:'
|
|
||||||
- expr: |
|
|
||||||
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
|
|
||||||
record: 'node:node_memory_utilisation_2:'
|
|
||||||
- expr: |
|
|
||||||
1e3 * sum by (node) (
|
|
||||||
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
|
|
||||||
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_memory_swap_io_bytes:sum_rate
|
|
||||||
- expr: |
|
|
||||||
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
|
||||||
record: :node_disk_utilisation:avg_irate
|
|
||||||
- expr: |
|
|
||||||
avg by (node) (
|
|
||||||
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_disk_utilisation:avg_irate
|
|
||||||
- expr: |
|
|
||||||
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
|
||||||
record: :node_disk_saturation:avg_irate
|
|
||||||
- expr: |
|
|
||||||
avg by (node) (
|
|
||||||
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_disk_saturation:avg_irate
|
|
||||||
- expr: |
|
|
||||||
max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
|
|
||||||
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
||||||
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
||||||
record: 'node:node_filesystem_usage:'
|
|
||||||
- expr: |
|
|
||||||
max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
||||||
record: 'node:node_filesystem_avail:'
|
|
||||||
- expr: |
|
|
||||||
sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
|
|
||||||
sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
|
|
||||||
record: :node_net_utilisation:sum_irate
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
|
|
||||||
irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_net_utilisation:sum_irate
|
|
||||||
- expr: |
|
|
||||||
sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
|
|
||||||
sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
|
|
||||||
record: :node_net_saturation:sum_irate
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
|
|
||||||
irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_net_saturation:sum_irate
|
|
||||||
- expr: |
|
|
||||||
max(
|
|
||||||
max(
|
|
||||||
kube_pod_info{job="kube-state-metrics", host_ip!=""}
|
|
||||||
) by (node, host_ip)
|
|
||||||
* on (host_ip) group_right (node)
|
|
||||||
label_replace(
|
|
||||||
(max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
|
||||||
)
|
|
||||||
) by (node)
|
|
||||||
record: 'node:node_inodes_total:'
|
|
||||||
- expr: |
|
|
||||||
max(
|
|
||||||
max(
|
|
||||||
kube_pod_info{job="kube-state-metrics", host_ip!=""}
|
|
||||||
) by (node, host_ip)
|
|
||||||
* on (host_ip) group_right (node)
|
|
||||||
label_replace(
|
|
||||||
(max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
|
||||||
)
|
|
||||||
) by (node)
|
|
||||||
record: 'node:node_inodes_free:'
|
|
||||||
- name: kube-prometheus-node-recording.rules
|
- name: kube-prometheus-node-recording.rules
|
||||||
rules:
|
rules:
|
||||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
||||||
@@ -663,17 +488,17 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
||||||
expr: |
|
expr: |
|
||||||
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
|
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubePodNotReady
|
- alert: KubePodNotReady
|
||||||
annotations:
|
annotations:
|
||||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||||
state for longer than an hour.
|
state for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||||
expr: |
|
expr: |
|
||||||
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
|
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubeDeploymentGenerationMismatch
|
- alert: KubeDeploymentGenerationMismatch
|
||||||
@@ -692,13 +517,13 @@ spec:
|
|||||||
- alert: KubeDeploymentReplicasMismatch
|
- alert: KubeDeploymentReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
|
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
|
||||||
matched the expected number of replicas for longer than an hour.
|
matched the expected number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||||
expr: |
|
expr: |
|
||||||
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
||||||
!=
|
!=
|
||||||
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubeStatefulSetReplicasMismatch
|
- alert: KubeStatefulSetReplicasMismatch
|
||||||
@@ -806,7 +631,7 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
|
||||||
expr: |
|
expr: |
|
||||||
kube_job_status_failed{job="kube-state-metrics"} > 0
|
kube_job_status_failed{job="kube-state-metrics"} > 0
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- name: kubernetes-resources
|
- name: kubernetes-resources
|
||||||
@@ -940,7 +765,7 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
|
||||||
expr: |
|
expr: |
|
||||||
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeVersionMismatch
|
- alert: KubeVersionMismatch
|
||||||
@@ -950,7 +775,7 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
||||||
expr: |
|
expr: |
|
||||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeClientErrors
|
- alert: KubeClientErrors
|
||||||
|
Reference in New Issue
Block a user