jsonnet: pin kubernetes-mixin version
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
@@ -18,7 +18,7 @@
                     "subdir": ""
                 }
             },
-            "version": "master"
+            "version": "release-0.3"
         },
         {
             "name": "grafana",
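
Note: the hunk above pins the kubernetes-mixin dependency (apparently in jsonnetfile.json) to the release-0.3 branch instead of tracking master, so the regenerated manifests only change when the pin is bumped deliberately. Assuming the repository manages these dependencies with jsonnet-bundler, a pin like this is typically produced with something along the lines of "jb install github.com/kubernetes-monitoring/kubernetes-mixin@release-0.3", which also rewrites the lock entry shown in the next hunk; the remaining hunks are the regenerated Grafana dashboard and Prometheus rule manifests picked up from that branch.
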
@@ -72,8 +72,8 @@
                     "subdir": ""
                 }
             },
-            "version": "68f82d2a428d91df57e9af43739981a6a8ede897",
-            "sum": "J/tuXi0Z8GRHo63pM17YFIyk4QgkFuMcQ20mAxi1flM="
+            "version": "7bf9a2a321356a7625509fe458132c26b2e33b29",
+            "sum": "4c1hfhiHdWmR80Wzw8ntr9Ef7z2JZHYNUU25+RKi8yw="
         },
         {
             "name": "node-mixin",
@@ -22944,7 +22944,7 @@ items:
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                    "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                     "format": "time_series",
                     "interval": "1m",
                     "intervalFactor": 2,
@@ -22952,7 +22952,7 @@ items:
                     "refId": "A"
                 },
                 {
-                    "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                    "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                     "format": "time_series",
                     "interval": "1m",
                     "intervalFactor": 2,
@@ -22960,7 +22960,7 @@ items:
                     "refId": "B"
                 },
                 {
-                    "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                    "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                     "format": "time_series",
                     "interval": "1m",
                     "intervalFactor": 2,
@@ -40,10 +40,10 @@ spec:
         rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
       record: instance:node_vmstat_pgmajfault:rate1m
     - expr: |
-        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
       record: instance_device:node_disk_io_time_seconds:rate1m
     - expr: |
-        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
       record: instance_device:node_disk_io_time_weighted_seconds:rate1m
     - expr: |
         sum without (device) (
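
Note: the dasd.+ alternative added to the device regex, both in the dashboard queries above and in the two node-exporter recording rules, extends disk matching to DASD devices (dasda, dasdb, ...), the block-device naming used by Linux on s390x / IBM Z. Under the old pattern those devices were silently excluded from the disk I/O panels and from the instance_device:node_disk_io_time_seconds:rate1m and instance_device:node_disk_io_time_weighted_seconds:rate1m rules. A quick way to check which devices a given pattern actually matches on a live Prometheus (an illustration, not part of the commit):

    count by (device) (node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"})
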
@@ -95,23 +95,33 @@ spec:
     - expr: |
         sum by (cluster, namespace, pod, container) (
           rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
-        ) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info)
+        ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
+          1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
     - expr: |
         container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_working_set_bytes
     - expr: |
         container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_rss
     - expr: |
         container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_cache
     - expr: |
         container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_swap
     - expr: |
         sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
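
Note: the group_left joins in the hunk above require the right-hand side to resolve to exactly one series per on(...) group. The previous max by(namespace, pod, node) (kube_pod_info) form still keeps one series per distinct node value, so if kube_pod_info briefly reports a pod on two nodes, or with an empty node label, the join fails with a many-to-many matching error and the recording rule stops producing samples. Wrapping the right side in topk(1, ...) and filtering node!="" forces a single match. The shape of the fix, annotated, using the container_memory_rss rule from the hunk above:

    # left side: one series per running container, from cAdvisor via the kubelet
    container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
    # copy the node label onto each container series ...
    * on (namespace, pod) group_left(node)
      # ... from a right-hand side forced down to a single series per
      # (namespace, pod): topk(1, ...) breaks ties if kube_pod_info momentarily
      # exists for two nodes, and node!="" drops series without a node label
      topk by (namespace, pod) (1,
        max by (namespace, pod, node) (kube_pod_info{node!=""})
      )

The same deduplication pattern recurs in the workload, node.rules and KubePodNotReady hunks below.
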
@@ -139,35 +149,39 @@ spec:
         )
       record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             label_replace(
               kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
               "replicaset", "$1", "owner_name", "(.*)"
-            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
+            ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
+              1, max by (replicaset, namespace, owner_name) (
+                kube_replicaset_owner{job="kube-state-metrics"}
+              )
+            ),
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: deployment
       record: mixin_pod_workload
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: daemonset
       record: mixin_pod_workload
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: statefulset
       record: mixin_pod_workload
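
Note: the mixin_pod_workload rules above also switch the outer aggregation from sum(...) by (cluster, namespace, workload, pod) to max by (cluster, namespace, workload, pod), and the Deployment variant deduplicates the kube_replicaset_owner join with the same topk(1, ...) construct. kube_pod_owner and kube_replicaset_owner are info-style metrics whose value is 1, so if duplicate series ever reach the rule (for example from overlapping kube-state-metrics targets), sum would record 2 for that pod while max keeps the expected value of 1, and the deduplicated owner join avoids the many-to-many matching failure described above.
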
@@ -221,10 +235,13 @@ spec:
   - name: node.rules
     rules:
    - expr: |
-        sum(min(kube_pod_info) by (cluster, node))
+        sum(min(kube_pod_info{node!=""}) by (cluster, node))
      record: ':kube_pod_info_node_count:'
    - expr: |
-        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
+        topk by(namespace, pod) (1,
+          max by (node, namespace, pod) (
+            label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
+        ))
      record: 'node_namespace_pod:kube_pod_info:'
    - expr: |
        count by (cluster, node) (sum by (node, cpu) (
@@ -445,7 +462,13 @@ spec:
           state for longer than 15 minutes.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
-        sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
+        sum by (namespace, pod) (
+          max by(namespace, pod) (
+            kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
+          ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
+            1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+          )
+        ) > 0
       for: 15m
       labels:
         severity: critical
@@ -1023,6 +1046,17 @@ spec:
         apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
       labels:
         severity: critical
+    - alert: AggregatedAPIErrors
+      annotations:
+        message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
+          reported errors. The number of errors have increased for it in the past
+          five minutes. High values indicate that the availability of the service
+          changes too often.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
+      expr: |
+        sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
+      labels:
+        severity: warning
     - alert: KubeAPIDown
       annotations:
         message: KubeAPI has disappeared from Prometheus target discovery.
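
Note: the new AggregatedAPIErrors alert fires, with no for: delay, once the unavailability counter of an aggregated APIService increases by more than 2 within five minutes. A minimal promtool unit-test sketch for it (not part of this commit; it assumes the alert group has been extracted into a plain rules file called alerts.yaml, and the APIService name is made up):

    # aggregated-api-errors-test.yaml, run with: promtool test rules aggregated-api-errors-test.yaml
    rule_files:
      - alerts.yaml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          # hypothetical aggregated API reported unavailable once per minute
          - series: 'aggregator_unavailable_apiservice_count{name="v1beta1.metrics.k8s.io",namespace="kube-system"}'
            values: '0+1x10'
        alert_rule_test:
          - eval_time: 10m
            alertname: AggregatedAPIErrors
            exp_alerts:
              - exp_labels:
                  severity: warning
                  name: v1beta1.metrics.k8s.io
                  namespace: kube-system

At the 10-minute mark increase(...[5m]) evaluates to roughly 5, above the threshold of 2, so exactly one firing alert carrying the sum by(name, namespace) labels plus severity: warning is expected.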