manifests: Regenerate files
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -65,122 +65,361 @@ spec:
|
||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate1m
|
||||
- name: kube-apiserver-error
|
||||
rules:
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[5m]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate5m
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[30m]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate30m
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[1h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate1h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[2h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate2h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[6h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate6h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[1d]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate1d
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[3d]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate3d
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate5m
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate30m
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate1h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate2h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate6h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate1d
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate3d
|
||||
- name: kube-apiserver.rules
|
||||
rules:
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[2h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[3d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[5m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[6h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: |
|
||||
1 - (
|
||||
(
|
||||
# write too slow
|
||||
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||
-
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
|
||||
) +
|
||||
(
|
||||
# read too slow
|
||||
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
|
||||
-
|
||||
(
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
|
||||
)
|
||||
) +
|
||||
# errors
|
||||
sum(code:apiserver_request_total:increase30d{code=~"5.."})
|
||||
)
|
||||
/
|
||||
sum(code:apiserver_request_total:increase30d)
|
||||
labels:
|
||||
verb: all
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |
|
||||
1 - (
|
||||
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
|
||||
-
|
||||
(
|
||||
# too slow
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."})
|
||||
)
|
||||
/
|
||||
sum(code:apiserver_request_total:increase30d{verb="read"})
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |
|
||||
1 - (
|
||||
(
|
||||
# too slow
|
||||
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||
-
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."})
|
||||
)
|
||||
/
|
||||
sum(code:apiserver_request_total:increase30d{verb="write"})
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |
|
||||
sum by (code) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code) (increase(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: |
|
||||
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
verb: read
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
verb: write
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
/
|
||||
@@ -628,7 +867,7 @@ spec:
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds < 0.05
|
||||
node_timex_offset_seconds < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) <= 0
|
||||
)
|
||||
@@ -791,7 +1030,7 @@ spec:
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
||||
expr: |
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
||||
for: 10m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeCronJobRunning
|
||||
@@ -865,11 +1104,11 @@ spec:
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemOvercommit
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted memory resource requests for Pods and cannot
|
||||
tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
||||
/
|
||||
@@ -881,10 +1120,10 @@ spec:
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeCPUOvercommit
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted CPU resource requests for Namespaces.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
|
||||
expr: |
|
||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
||||
/
|
||||
@@ -893,10 +1132,10 @@ spec:
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemOvercommit
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted memory resource requests for Namespaces.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
|
||||
expr: |
|
||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
||||
/
|
||||
@@ -934,12 +1173,12 @@ spec:
|
||||
severity: warning
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
- alert: KubePersistentVolumeUsageCritical
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
|
||||
}} free.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||
expr: |
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
||||
/
|
||||
@@ -948,12 +1187,12 @@ spec:
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeFullInFourDays
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
|
||||
days. Currently {{ $value | humanizePercentage }} is available.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||
expr: |
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
||||
@@ -964,7 +1203,7 @@ spec:
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
message: The persistent volume {{ $labels.persistentvolume }} has status {{
|
||||
@@ -1000,47 +1239,51 @@ spec:
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: kube-apiserver-error-alerts
|
||||
- name: kube-apiserver-slos
|
||||
rules:
|
||||
- alert: ErrorBudgetBurn
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
message: 'High requests error budget burn for job=apiserver (current value:
|
||||
{{ $value }})'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
|
||||
message: The API server is burning too much error budget
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||
expr: |
|
||||
(
|
||||
status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000)
|
||||
and
|
||||
status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000)
|
||||
)
|
||||
or
|
||||
(
|
||||
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000)
|
||||
and
|
||||
status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000)
|
||||
)
|
||||
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
|
||||
for: 2m
|
||||
labels:
|
||||
job: apiserver
|
||||
severity: critical
|
||||
- alert: ErrorBudgetBurn
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
message: 'High requests error budget burn for job=apiserver (current value:
|
||||
{{ $value }})'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
|
||||
message: The API server is burning too much error budget
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||
expr: |
|
||||
(
|
||||
status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000)
|
||||
and
|
||||
status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000)
|
||||
)
|
||||
or
|
||||
(
|
||||
status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000)
|
||||
and
|
||||
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000)
|
||||
)
|
||||
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
message: The API server is burning too much error budget
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||
expr: |
|
||||
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
message: The API server is burning too much error budget
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||
expr: |
|
||||
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
|
||||
for: 3h
|
||||
labels:
|
||||
job: apiserver
|
||||
severity: warning
|
||||
- name: kubernetes-system-apiserver
|
||||
rules:
|
||||
|
||||
Reference in New Issue
Block a user