# Prometheus rules file (YAML): recording rules and API server alerts.
# Rule group `kubernetes.rules`.
#
# Contents:
#   * recording rules that pre-aggregate container metrics per pod, per
#     namespace and for the whole cluster;
#   * recording rules for API server request latency quantiles;
#   * alerts on API server latency, error ratio, availability and client
#     certificate expiration.
groups:
  - name: kubernetes.rules
    rules:
      # ---------------------------------------------------------------
      # Per-pod aggregates. Series with container_name="POD" or an empty
      # pod_name are excluded by the selectors.
      # ---------------------------------------------------------------

      # Memory usage summed over each pod's containers.
      - record: pod_name:container_memory_usage_bytes:sum
        expr: >-
          sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""})
          BY (pod_name)

      # CPU shares summed over each pod's containers.
      - record: pod_name:container_spec_cpu_shares:sum
        expr: >-
          sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""})
          BY (pod_name)

      # 5-minute rate of CPU seconds consumed, summed per pod.
      - record: pod_name:container_cpu_usage:sum
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)

      # Filesystem usage summed over each pod's containers.
      - record: pod_name:container_fs_usage_bytes:sum
        expr: >-
          sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""})
          BY (pod_name)

      # ---------------------------------------------------------------
      # Per-namespace aggregates.
      # NOTE(review): the memory and CPU-share rules filter on
      # container_name!="" while the CPU-usage rule filters on
      # container_name!="POD" -- confirm this difference is intentional.
      # ---------------------------------------------------------------

      # Memory usage summed per namespace.
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)

      # CPU shares summed per namespace.
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)

      # 5-minute rate of CPU seconds consumed, summed per namespace.
      - record: namespace:container_cpu_usage:sum
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)

      # ---------------------------------------------------------------
      # Cluster-level ratios.
      # ---------------------------------------------------------------

      # Container memory usage divided by machine memory, per `cluster` label.
      - record: cluster:memory_usage:ratio
        expr: >-
          sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""})
          BY (cluster) / sum(machine_memory_bytes) BY (cluster)

      # Total CPU shares scaled by 1/1000, divided by the machine core count.
      # NOTE(review): presumably 1000 shares are treated as one core -- confirm.
      - record: cluster:container_spec_cpu_shares:ratio
        expr: >-
          sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""})
          / 1000 / sum(machine_cpu_cores)

      # Total container CPU usage rate divided by the machine core count.
      - record: cluster:container_cpu_usage:ratio
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)

      # ---------------------------------------------------------------
      # API server latency quantiles, tagged with a `quantile` label.
      # NOTE(review): the division by 1e+06 presumably converts the
      # histogram's unit to seconds -- confirm against the metric's unit.
      # ---------------------------------------------------------------

      # 99th percentile; this is the series the APIServerLatencyHigh alerts read.
      - record: apiserver_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m]))
          / 1e+06
        labels:
          quantile: "0.99"

      # 90th percentile.
      # NOTE(review): this record name (`apiserver_latency:quantile_seconds`)
      # differs from the 0.99 and 0.5 rules (`apiserver_latency_seconds:quantile`),
      # so a query on the latter never returns quantile="0.9". It is kept
      # unchanged here because dashboards or other rules may already depend
      # on the existing name -- confirm before renaming.
      - record: apiserver_latency:quantile_seconds
        expr: >-
          histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m]))
          / 1e+06
        labels:
          quantile: "0.9"

      # 50th percentile (median).
      - record: apiserver_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m]))
          / 1e+06
        labels:
          quantile: "0.5"

      # ---------------------------------------------------------------
      # API server latency alerts. Both read the recorded 0.99 quantile,
      # skip subresource="log" and the verbs WATCH, WATCHLIST, PROXY and
      # CONNECT, and must hold for 10 minutes before firing.
      # ---------------------------------------------------------------

      # Warning: 99th percentile latency above 1 second.
      - alert: APIServerLatencyHigh
        expr: >-
          apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency

      # Critical: 99th percentile latency above 4 seconds.
      - alert: APIServerLatencyHigh
        expr: >-
          apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: >-
            the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency

      # ---------------------------------------------------------------
      # API server error-ratio alerts.
      #
      # Both sides of the division are aggregated with sum(). Dividing the
      # raw vectors would match series one-to-one on identical label sets
      # (including `code`), so every 5xx series would be divided by itself
      # and the "percentage" would be 100 whenever any 5xx traffic exists.
      # ---------------------------------------------------------------

      # Warning: more than 2% of requests answered with a 5xx code.
      - alert: APIServerErrorsHigh
        expr: >-
          sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]))
          / sum(rate(apiserver_request_count[5m]))
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
          summary: API server request errors

      # Critical: more than 5% of requests answered with a 5xx code.
      - alert: APIServerErrorsHigh
        expr: >-
          sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]))
          / sum(rate(apiserver_request_count[5m]))
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
          # Added so the critical variant carries the same summary as the
          # warning variant.
          summary: API server request errors

      # Fires when no `up{job="apiserver"}` series has had the value 1 for
      # 20 minutes.
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: >-
            No API servers are reachable or all have disappeared from service
            discovery
          summary: No API servers are reachable

      # ---------------------------------------------------------------
      # Client certificate expiration alerts.
      # NOTE(review): these compare a raw `_bucket` sum against 0; if the
      # bucket values only ever grow, the alert may keep firing after the
      # certificate has been rotated -- confirm the intended behaviour.
      # ---------------------------------------------------------------

      # Warning: observations in the le="604800" bucket (604800 s = 7 days).
      - alert: K8sCertificateExpirationNotice
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
          summary: Kubernetes API Certificate is expiring soon

      # Critical: observations in the le="86400" bucket (86400 s = 1 day).
      - alert: K8sCertificateExpirationNotice
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
          summary: Kubernetes API Certificate is expiring