manifests: Regenerate files
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -65,122 +65,361 @@ spec:
|
|||||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
|
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
|
||||||
)
|
)
|
||||||
record: instance:node_network_transmit_drop_excluding_lo:rate1m
|
record: instance:node_network_transmit_drop_excluding_lo:rate1m
|
||||||
- name: kube-apiserver-error
|
|
||||||
rules:
|
|
||||||
- expr: |
|
|
||||||
sum by (status_class) (
|
|
||||||
label_replace(
|
|
||||||
rate(apiserver_request_total{job="apiserver"}[5m]
|
|
||||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class:apiserver_request_total:rate5m
|
|
||||||
- expr: |
|
|
||||||
sum by (status_class) (
|
|
||||||
label_replace(
|
|
||||||
rate(apiserver_request_total{job="apiserver"}[30m]
|
|
||||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class:apiserver_request_total:rate30m
|
|
||||||
- expr: |
|
|
||||||
sum by (status_class) (
|
|
||||||
label_replace(
|
|
||||||
rate(apiserver_request_total{job="apiserver"}[1h]
|
|
||||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class:apiserver_request_total:rate1h
|
|
||||||
- expr: |
|
|
||||||
sum by (status_class) (
|
|
||||||
label_replace(
|
|
||||||
rate(apiserver_request_total{job="apiserver"}[2h]
|
|
||||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class:apiserver_request_total:rate2h
|
|
||||||
- expr: |
|
|
||||||
sum by (status_class) (
|
|
||||||
label_replace(
|
|
||||||
rate(apiserver_request_total{job="apiserver"}[6h]
|
|
||||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class:apiserver_request_total:rate6h
|
|
||||||
- expr: |
|
|
||||||
sum by (status_class) (
|
|
||||||
label_replace(
|
|
||||||
rate(apiserver_request_total{job="apiserver"}[1d]
|
|
||||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class:apiserver_request_total:rate1d
|
|
||||||
- expr: |
|
|
||||||
sum by (status_class) (
|
|
||||||
label_replace(
|
|
||||||
rate(apiserver_request_total{job="apiserver"}[3d]
|
|
||||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class:apiserver_request_total:rate3d
|
|
||||||
- expr: |
|
|
||||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
|
|
||||||
/
|
|
||||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class_5xx:apiserver_request_total:ratio_rate5m
|
|
||||||
- expr: |
|
|
||||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
|
|
||||||
/
|
|
||||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class_5xx:apiserver_request_total:ratio_rate30m
|
|
||||||
- expr: |
|
|
||||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
|
|
||||||
/
|
|
||||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class_5xx:apiserver_request_total:ratio_rate1h
|
|
||||||
- expr: |
|
|
||||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
|
|
||||||
/
|
|
||||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class_5xx:apiserver_request_total:ratio_rate2h
|
|
||||||
- expr: |
|
|
||||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
|
|
||||||
/
|
|
||||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class_5xx:apiserver_request_total:ratio_rate6h
|
|
||||||
- expr: |
|
|
||||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
|
|
||||||
/
|
|
||||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class_5xx:apiserver_request_total:ratio_rate1d
|
|
||||||
- expr: |
|
|
||||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
|
|
||||||
/
|
|
||||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
|
|
||||||
labels:
|
|
||||||
job: apiserver
|
|
||||||
record: status_class_5xx:apiserver_request_total:ratio_rate3d
|
|
||||||
- name: kube-apiserver.rules
|
- name: kube-apiserver.rules
|
||||||
rules:
|
rules:
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1d])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: apiserver_request:burnrate1d
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1h])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: apiserver_request:burnrate1h
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[2h])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: apiserver_request:burnrate2h
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30m])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: apiserver_request:burnrate30m
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[3d])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: apiserver_request:burnrate3d
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[5m])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: apiserver_request:burnrate5m
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[6h])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: apiserver_request:burnrate6h
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||||
|
-
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: apiserver_request:burnrate1d
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||||
|
-
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: apiserver_request:burnrate1h
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||||
|
-
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: apiserver_request:burnrate2h
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||||
|
-
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: apiserver_request:burnrate30m
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||||
|
-
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: apiserver_request:burnrate3d
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||||
|
-
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: apiserver_request:burnrate5m
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||||
|
-
|
||||||
|
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: apiserver_request:burnrate6h
|
||||||
|
- expr: |
|
||||||
|
1 - (
|
||||||
|
(
|
||||||
|
# write too slow
|
||||||
|
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||||
|
-
|
||||||
|
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
|
||||||
|
) +
|
||||||
|
(
|
||||||
|
# read too slow
|
||||||
|
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
|
||||||
|
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
|
||||||
|
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
|
||||||
|
)
|
||||||
|
) +
|
||||||
|
# errors
|
||||||
|
sum(code:apiserver_request_total:increase30d{code=~"5.."})
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(code:apiserver_request_total:increase30d)
|
||||||
|
labels:
|
||||||
|
verb: all
|
||||||
|
record: apiserver_request:availability30d
|
||||||
|
- expr: |
|
||||||
|
1 - (
|
||||||
|
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
|
||||||
|
-
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
|
||||||
|
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
|
||||||
|
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."})
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(code:apiserver_request_total:increase30d{verb="read"})
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: apiserver_request:availability30d
|
||||||
|
- expr: |
|
||||||
|
1 - (
|
||||||
|
(
|
||||||
|
# too slow
|
||||||
|
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||||
|
-
|
||||||
|
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
|
||||||
|
)
|
||||||
|
+
|
||||||
|
# errors
|
||||||
|
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."})
|
||||||
|
)
|
||||||
|
/
|
||||||
|
sum(code:apiserver_request_total:increase30d{verb="write"})
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: apiserver_request:availability30d
|
||||||
|
- expr: |
|
||||||
|
sum by (code) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30d]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: code:apiserver_request_total:increase30d
|
||||||
|
- expr: |
|
||||||
|
sum by (code) (increase(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: code:apiserver_request_total:increase30d
|
||||||
|
- expr: |
|
||||||
|
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||||
|
labels:
|
||||||
|
verb: read
|
||||||
|
record: code_resource:apiserver_request_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||||
|
labels:
|
||||||
|
verb: write
|
||||||
|
record: code_resource:apiserver_request_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
|
||||||
|
labels:
|
||||||
|
quantile: "0.99"
|
||||||
|
verb: read
|
||||||
|
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||||
|
- expr: |
|
||||||
|
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
|
||||||
|
labels:
|
||||||
|
quantile: "0.99"
|
||||||
|
verb: write
|
||||||
|
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||||
/
|
/
|
||||||
@@ -628,7 +867,7 @@ spec:
|
|||||||
)
|
)
|
||||||
or
|
or
|
||||||
(
|
(
|
||||||
node_timex_offset_seconds < 0.05
|
node_timex_offset_seconds < -0.05
|
||||||
and
|
and
|
||||||
deriv(node_timex_offset_seconds[5m]) <= 0
|
deriv(node_timex_offset_seconds[5m]) <= 0
|
||||||
)
|
)
|
||||||
@@ -791,7 +1030,7 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
||||||
expr: |
|
expr: |
|
||||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
||||||
for: 10m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeCronJobRunning
|
- alert: KubeCronJobRunning
|
||||||
@@ -865,11 +1104,11 @@ spec:
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeMemOvercommit
|
- alert: KubeMemoryOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
message: Cluster has overcommitted memory resource requests for Pods and cannot
|
message: Cluster has overcommitted memory resource requests for Pods and cannot
|
||||||
tolerate node failure.
|
tolerate node failure.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
|
||||||
expr: |
|
expr: |
|
||||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
||||||
/
|
/
|
||||||
@@ -881,10 +1120,10 @@ spec:
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeCPUOvercommit
|
- alert: KubeCPUQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
message: Cluster has overcommitted CPU resource requests for Namespaces.
|
message: Cluster has overcommitted CPU resource requests for Namespaces.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
|
||||||
expr: |
|
expr: |
|
||||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
||||||
/
|
/
|
||||||
@@ -893,10 +1132,10 @@ spec:
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeMemOvercommit
|
- alert: KubeMemoryQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
message: Cluster has overcommitted memory resource requests for Namespaces.
|
message: Cluster has overcommitted memory resource requests for Namespaces.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
|
||||||
expr: |
|
expr: |
|
||||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
||||||
/
|
/
|
||||||
@@ -934,12 +1173,12 @@ spec:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- name: kubernetes-storage
|
- name: kubernetes-storage
|
||||||
rules:
|
rules:
|
||||||
- alert: KubePersistentVolumeUsageCritical
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
|
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
|
||||||
}} free.
|
}} free.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||||
expr: |
|
expr: |
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
||||||
/
|
/
|
||||||
@@ -948,12 +1187,12 @@ spec:
|
|||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubePersistentVolumeFullInFourDays
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
|
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
|
||||||
days. Currently {{ $value | humanizePercentage }} is available.
|
days. Currently {{ $value | humanizePercentage }} is available.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
||||||
@@ -964,7 +1203,7 @@ spec:
|
|||||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||||
for: 1h
|
for: 1h
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: warning
|
||||||
- alert: KubePersistentVolumeErrors
|
- alert: KubePersistentVolumeErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: The persistent volume {{ $labels.persistentvolume }} has status {{
|
message: The persistent volume {{ $labels.persistentvolume }} has status {{
|
||||||
@@ -1000,47 +1239,51 @@ spec:
|
|||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- name: kube-apiserver-error-alerts
|
- name: kube-apiserver-slos
|
||||||
rules:
|
rules:
|
||||||
- alert: ErrorBudgetBurn
|
- alert: KubeAPIErrorBudgetBurn
|
||||||
annotations:
|
annotations:
|
||||||
message: 'High requests error budget burn for job=apiserver (current value:
|
message: The API server is burning too much error budget
|
||||||
{{ $value }})'
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
|
|
||||||
expr: |
|
expr: |
|
||||||
(
|
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
|
||||||
status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000)
|
and
|
||||||
and
|
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
|
||||||
status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000)
|
for: 2m
|
||||||
)
|
|
||||||
or
|
|
||||||
(
|
|
||||||
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000)
|
|
||||||
and
|
|
||||||
status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000)
|
|
||||||
)
|
|
||||||
labels:
|
labels:
|
||||||
job: apiserver
|
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ErrorBudgetBurn
|
- alert: KubeAPIErrorBudgetBurn
|
||||||
annotations:
|
annotations:
|
||||||
message: 'High requests error budget burn for job=apiserver (current value:
|
message: The API server is burning too much error budget
|
||||||
{{ $value }})'
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
|
|
||||||
expr: |
|
expr: |
|
||||||
(
|
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
|
||||||
status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000)
|
and
|
||||||
and
|
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
|
||||||
status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000)
|
for: 15m
|
||||||
)
|
labels:
|
||||||
or
|
severity: critical
|
||||||
(
|
- alert: KubeAPIErrorBudgetBurn
|
||||||
status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000)
|
annotations:
|
||||||
and
|
message: The API server is burning too much error budget
|
||||||
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000)
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||||
)
|
expr: |
|
||||||
|
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
|
||||||
|
and
|
||||||
|
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeAPIErrorBudgetBurn
|
||||||
|
annotations:
|
||||||
|
message: The API server is burning too much error budget
|
||||||
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||||
|
expr: |
|
||||||
|
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
|
||||||
|
and
|
||||||
|
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
|
||||||
|
for: 3h
|
||||||
labels:
|
labels:
|
||||||
job: apiserver
|
|
||||||
severity: warning
|
severity: warning
|
||||||
- name: kubernetes-system-apiserver
|
- name: kubernetes-system-apiserver
|
||||||
rules:
|
rules:
|
||||||
|
|||||||
Reference in New Issue
Block a user