jsonnet: pin kubernetes-mixin version

Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
Author: Damien Grisonnet <dgrisonn@redhat.com>
Date: 2020-05-26 18:13:30 +02:00
Commit: 3f653b6d3e (parent: e8e1109d98)
4 changed files with 57 additions and 23 deletions

@@ -18,7 +18,7 @@
           "subdir": ""
         }
       },
-      "version": "master"
+      "version": "release-0.3"
     },
     {
       "name": "grafana",

@@ -72,8 +72,8 @@
           "subdir": ""
         }
       },
-      "version": "68f82d2a428d91df57e9af43739981a6a8ede897",
-      "sum": "J/tuXi0Z8GRHo63pM17YFIyk4QgkFuMcQ20mAxi1flM="
+      "version": "7bf9a2a321356a7625509fe458132c26b2e33b29",
+      "sum": "4c1hfhiHdWmR80Wzw8ntr9Ef7z2JZHYNUU25+RKi8yw="
     },
     {
       "name": "node-mixin",

@@ -22944,7 +22944,7 @@ items:
             "steppedLine": false,
             "targets": [
               {
-                "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                 "format": "time_series",
                 "interval": "1m",
                 "intervalFactor": 2,
@@ -22952,7 +22952,7 @@ items:
                 "refId": "A"
               },
               {
-                "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                 "format": "time_series",
                 "interval": "1m",
                 "intervalFactor": 2,
@@ -22960,7 +22960,7 @@ items:
                 "refId": "B"
               },
               {
-                "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+                "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
                 "format": "time_series",
                 "interval": "1m",
                 "intervalFactor": 2,

@@ -40,10 +40,10 @@ spec:
         rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
       record: instance:node_vmstat_pgmajfault:rate1m
     - expr: |
-        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
       record: instance_device:node_disk_io_time_seconds:rate1m
     - expr: |
-        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
       record: instance_device:node_disk_io_time_weighted_seconds:rate1m
     - expr: |
         sum without (device) (
@@ -95,23 +95,33 @@ spec:
     - expr: |
         sum by (cluster, namespace, pod, container) (
          rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
-        ) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info)
+        ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
+          1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
     - expr: |
         container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_working_set_bytes
     - expr: |
         container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_rss
     - expr: |
         container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_cache
     - expr: |
         container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
-        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+          max by(namespace, pod, node) (kube_pod_info{node!=""})
+        )
       record: node_namespace_pod_container:container_memory_swap
     - expr: |
         sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
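
Note: each of the five rules above swaps the join operand
max by(namespace, pod, node) (kube_pod_info) for
topk by(namespace, pod) (1, max ... (kube_pod_info{node!=""})). topk ... (1, v)
keeps exactly one series per grouping, so the right-hand side of the
group_left join stays unique even if kube_pod_info briefly carries duplicate
series for a pod (e.g. while it is rescheduled onto another node), which would
otherwise fail the many-to-one match. A minimal sketch of the pattern in
isolation (illustrative, with a trimmed selector):

    # Deduplicate kube_pod_info per (namespace, pod) before joining, so the
    # many-to-one match cannot degenerate into a many-to-many error.
    container_memory_rss{job="kubelet", image!=""}
      * on (namespace, pod) group_left(node)
        topk by (namespace, pod) (1, kube_pod_info{node!=""})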
@@ -139,35 +149,39 @@ spec:
         )
       record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             label_replace(
               kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
               "replicaset", "$1", "owner_name", "(.*)"
-            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
+            ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
+              1, max by (replicaset, namespace, owner_name) (
+                kube_replicaset_owner{job="kube-state-metrics"}
+              )
+            ),
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: deployment
       record: mixin_pod_workload
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: daemonset
       record: mixin_pod_workload
     - expr: |
-        sum(
+        max by (cluster, namespace, workload, pod) (
           label_replace(
             kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
             "workload", "$1", "owner_name", "(.*)"
           )
-        ) by (cluster, namespace, workload, pod)
+        )
       labels:
         workload_type: statefulset
       record: mixin_pod_workload
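
Note: the three workload-mapping rules switch the outer aggregation from sum
to max over (cluster, namespace, workload, pod) and deduplicate
kube_replicaset_owner with the same topk pattern. Because kube_pod_owner is an
info metric with value 1, max is idempotent under duplicates, whereas sum
would inflate the value. Consumers of the recorded series are unaffected; an
illustrative query (the namespace is a placeholder):

    # Pods belonging to deployments in a given namespace.
    mixin_pod_workload{namespace="monitoring", workload_type="deployment"}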
@@ -221,10 +235,13 @@ spec:
   - name: node.rules
     rules:
     - expr: |
-        sum(min(kube_pod_info) by (cluster, node))
+        sum(min(kube_pod_info{node!=""}) by (cluster, node))
       record: ':kube_pod_info_node_count:'
     - expr: |
-        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
+        topk by(namespace, pod) (1,
+          max by (node, namespace, pod) (
+            label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
+        ))
       record: 'node_namespace_pod:kube_pod_info:'
     - expr: |
         count by (cluster, node) (sum by (node, cpu) (
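
Note: node_namespace_pod:kube_pod_info: gets the same hardening: pods without
a node label are filtered out, and topk by(namespace, pod) (1, ...) guarantees
a single pod-to-node mapping per pod. An illustrative lookup against the
recorded series (names are placeholders):

    # On which node is this pod running?
    node_namespace_pod:kube_pod_info:{namespace="default", pod="example-pod"}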
@@ -445,7 +462,13 @@ spec:
          state for longer than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      expr: |
-        sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
+        sum by (namespace, pod) (
+          max by(namespace, pod) (
+            kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
+          ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
+            1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+          )
+        ) > 0
      for: 15m
      labels:
        severity: critical
@@ -1023,6 +1046,17 @@ spec:
          apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
       labels:
         severity: critical
+    - alert: AggregatedAPIErrors
+      annotations:
+        message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
+          reported errors. The number of errors have increased for it in the past
+          five minutes. High values indicate that the availability of the service
+          changes too often.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
+      expr: |
+        sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
+      labels:
+        severity: warning
     - alert: KubeAPIDown
       annotations:
         message: KubeAPI has disappeared from Prometheus target discovery.
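
Note: the new AggregatedAPIErrors alert arrives with the pinned release-0.3
mixin; it fires once an aggregated APIService's unavailability counter grows
by more than 2 within five minutes. The raw signal it evaluates can be
inspected directly (illustrative ad-hoc query):

    # Per-APIService growth of the unavailable counter over 5 minutes.
    sum by (name, namespace) (increase(aggregator_unavailable_apiservice_count[5m]))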