jsonnet: pin kubernetes-mixin version
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
This commit is contained in:
@@ -18,7 +18,7 @@
|
|||||||
"subdir": ""
|
"subdir": ""
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "master"
|
"version": "release-0.2"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafana",
|
"name": "grafana",
|
||||||
|
|||||||
@@ -72,8 +72,8 @@
|
|||||||
"subdir": ""
|
"subdir": ""
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5",
|
"version": "a132ade95740f9364e477ae8e730eabd650d14cb",
|
||||||
"sum": "qfm0EpLrEZ1+fe93LFLa9tyOalK6JehpholxO2d0xXU="
|
"sum": "+5+biGgOmWhNenvUxAtdejDgL3FvdDp6Dv84v3Gdg6A="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "node-mixin",
|
"name": "node-mixin",
|
||||||
|
|||||||
@@ -269,7 +269,7 @@ items:
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (verb, le))",
|
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", verb!=\"WATCH\"}[5m])) by (verb, le))",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "{{verb}}",
|
"legendFormat": "{{verb}}",
|
||||||
@@ -22649,7 +22649,7 @@ items:
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"interval": "1m",
|
"interval": "1m",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
@@ -22657,7 +22657,7 @@ items:
|
|||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"interval": "1m",
|
"interval": "1m",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
@@ -22665,7 +22665,7 @@ items:
|
|||||||
"refId": "B"
|
"refId": "B"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
|
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"interval": "1m",
|
"interval": "1m",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
@@ -24915,7 +24915,7 @@ items:
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum by(container) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})",
|
"expr": "sum by(container) (container_memory_working_set_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "Current: {{ container }}",
|
"legendFormat": "Current: {{ container }}",
|
||||||
|
|||||||
@@ -40,10 +40,10 @@ spec:
|
|||||||
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
|
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
|
||||||
record: instance:node_vmstat_pgmajfault:rate1m
|
record: instance:node_vmstat_pgmajfault:rate1m
|
||||||
- expr: |
|
- expr: |
|
||||||
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
|
||||||
record: instance_device:node_disk_io_time_seconds:rate1m
|
record: instance_device:node_disk_io_time_seconds:rate1m
|
||||||
- expr: |
|
- expr: |
|
||||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
|
||||||
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
|
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
|
||||||
- expr: |
|
- expr: |
|
||||||
sum without (device) (
|
sum without (device) (
|
||||||
@@ -68,17 +68,22 @@ spec:
|
|||||||
- name: kube-apiserver.rules
|
- name: kube-apiserver.rules
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||||
|
record: cluster:apiserver_request_duration_seconds:mean5m
|
||||||
|
- expr: |
|
||||||
|
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||||
@@ -431,7 +436,7 @@ spec:
|
|||||||
state for longer than 15 minutes.
|
state for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||||
expr: |
|
expr: |
|
||||||
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
|
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@@ -759,12 +764,26 @@ spec:
|
|||||||
rules:
|
rules:
|
||||||
- alert: KubeAPILatencyHigh
|
- alert: KubeAPILatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: The API server has a 99th percentile latency of {{ $value }} seconds
|
message: The API server has an abnormal latency of {{ $value }} seconds for
|
||||||
for {{ $labels.verb }} {{ $labels.resource }}.
|
{{ $labels.verb }} {{ $labels.resource }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||||
expr: |
|
expr: |
|
||||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1
|
(
|
||||||
for: 10m
|
cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
|
||||||
|
>
|
||||||
|
on (verb) group_left()
|
||||||
|
(
|
||||||
|
avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||||
|
+
|
||||||
|
2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||||
|
)
|
||||||
|
) > on (verb) group_left()
|
||||||
|
1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||||
|
and on (verb,resource)
|
||||||
|
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
|
||||||
|
>
|
||||||
|
1
|
||||||
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeAPILatencyHigh
|
- alert: KubeAPILatencyHigh
|
||||||
@@ -773,7 +792,7 @@ spec:
|
|||||||
for {{ $labels.verb }} {{ $labels.resource }}.
|
for {{ $labels.verb }} {{ $labels.resource }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||||
expr: |
|
expr: |
|
||||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4
|
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|||||||
Reference in New Issue
Block a user