jsonnet: pin kubernetes-mixin version
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
This commit is contained in:
@@ -22944,7 +22944,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
||||
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
@@ -22952,7 +22952,7 @@ items:
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
||||
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
@@ -22960,7 +22960,7 @@ items:
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
||||
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
|
||||
@@ -40,10 +40,10 @@ spec:
|
||||
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
|
||||
record: instance:node_vmstat_pgmajfault:rate1m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate1m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
@@ -95,23 +95,33 @@ spec:
|
||||
- expr: |
|
||||
sum by (cluster, namespace, pod, container) (
|
||||
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
|
||||
) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info)
|
||||
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
|
||||
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
|
||||
- expr: |
|
||||
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
||||
- expr: |
|
||||
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_rss
|
||||
- expr: |
|
||||
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_cache
|
||||
- expr: |
|
||||
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_swap
|
||||
- expr: |
|
||||
sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
|
||||
@@ -139,35 +149,39 @@ spec:
|
||||
)
|
||||
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
|
||||
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
|
||||
1, max by (replicaset, namespace, owner_name) (
|
||||
kube_replicaset_owner{job="kube-state-metrics"}
|
||||
)
|
||||
),
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (cluster, namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: mixin_pod_workload
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (cluster, namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: mixin_pod_workload
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (cluster, namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: mixin_pod_workload
|
||||
@@ -221,10 +235,13 @@ spec:
|
||||
- name: node.rules
|
||||
rules:
|
||||
- expr: |
|
||||
sum(min(kube_pod_info) by (cluster, node))
|
||||
sum(min(kube_pod_info{node!=""}) by (cluster, node))
|
||||
record: ':kube_pod_info_node_count:'
|
||||
- expr: |
|
||||
max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
|
||||
topk by(namespace, pod) (1,
|
||||
max by (node, namespace, pod) (
|
||||
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
|
||||
))
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
- expr: |
|
||||
count by (cluster, node) (sum by (node, cpu) (
|
||||
@@ -445,7 +462,13 @@ spec:
|
||||
state for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||
expr: |
|
||||
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
|
||||
sum by (namespace, pod) (
|
||||
max by(namespace, pod) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
|
||||
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
|
||||
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1023,6 +1046,17 @@ spec:
|
||||
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AggregatedAPIErrors
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
|
||||
reported errors. The number of errors have increased for it in the past
|
||||
five minutes. High values indicate that the availability of the service
|
||||
changes too often.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
||||
expr: |
|
||||
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
message: KubeAPI has disappeared from Prometheus target discovery.
|
||||
|
||||
Reference in New Issue
Block a user