jsonnet: pin kubernetes-mixin version
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
This commit is contained in:
@@ -18,7 +18,7 @@
|
||||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "master"
|
||||
"version": "release-0.2"
|
||||
},
|
||||
{
|
||||
"name": "grafana",
|
||||
|
||||
@@ -72,8 +72,8 @@
|
||||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5",
|
||||
"sum": "qfm0EpLrEZ1+fe93LFLa9tyOalK6JehpholxO2d0xXU="
|
||||
"version": "a132ade95740f9364e477ae8e730eabd650d14cb",
|
||||
"sum": "+5+biGgOmWhNenvUxAtdejDgL3FvdDp6Dv84v3Gdg6A="
|
||||
},
|
||||
{
|
||||
"name": "node-mixin",
|
||||
|
||||
@@ -269,7 +269,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (verb, le))",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", verb!=\"WATCH\"}[5m])) by (verb, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{verb}}",
|
||||
@@ -22649,7 +22649,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
||||
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
@@ -22657,7 +22657,7 @@ items:
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
||||
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
@@ -22665,7 +22665,7 @@ items:
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
||||
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
@@ -24915,7 +24915,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(container) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})",
|
||||
"expr": "sum by(container) (container_memory_working_set_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Current: {{ container }}",
|
||||
|
||||
@@ -40,10 +40,10 @@ spec:
|
||||
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
|
||||
record: instance:node_vmstat_pgmajfault:rate1m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate1m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
@@ -68,17 +68,22 @@ spec:
|
||||
- name: kube-apiserver.rules
|
||||
rules:
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
/
|
||||
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
record: cluster:apiserver_request_duration_seconds:mean5m
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
@@ -431,7 +436,7 @@ spec:
|
||||
state for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||
expr: |
|
||||
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
|
||||
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -759,12 +764,26 @@ spec:
|
||||
rules:
|
||||
- alert: KubeAPILatencyHigh
|
||||
annotations:
|
||||
message: The API server has a 99th percentile latency of {{ $value }} seconds
|
||||
for {{ $labels.verb }} {{ $labels.resource }}.
|
||||
message: The API server has an abnormal latency of {{ $value }} seconds for
|
||||
{{ $labels.verb }} {{ $labels.resource }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||
expr: |
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1
|
||||
for: 10m
|
||||
(
|
||||
cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
|
||||
>
|
||||
on (verb) group_left()
|
||||
(
|
||||
avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||
+
|
||||
2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||
)
|
||||
) > on (verb) group_left()
|
||||
1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||
and on (verb,resource)
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
|
||||
>
|
||||
1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPILatencyHigh
|
||||
@@ -773,7 +792,7 @@ spec:
|
||||
for {{ $labels.verb }} {{ $labels.resource }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||
expr: |
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
Reference in New Issue
Block a user