Remove rules that have been migrated to kubernetes-mixins
This commit is contained in:
@@ -1,13 +0,0 @@
|
|||||||
# Alerting rules for the Kubernetes controller manager.
groups:
  - name: kube-controller-manager.rules
    rules:
      # Fires when no target of the kube-controller-manager job reports up == 1
      # for five minutes.
      - alert: K8SControllerManagerDown
        expr: >-
          absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Controller manager is down
          description: >-
            There is no running K8S controller manager. Deployments and
            replication controllers are not making progress.
          runbook: 'https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager'
|
|
@@ -1,58 +0,0 @@
|
|||||||
# Recording and alerting rules for the Kubernetes scheduler.
groups:
  - name: kube-scheduler.rules
    rules:
      # Each latency histogram below is recorded at the 0.99, 0.9 and 0.5
      # quantiles. The source metrics are named *_microseconds and the result
      # is divided by 1e+06, so the recorded series are named *_seconds.

      # End-to-end scheduling latency.
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.99,
          sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.9,
          sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.5,
          sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.5"

      # Scheduling algorithm latency.
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.99,
          sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.9,
          sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.5,
          sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.5"

      # Binding latency.
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.99,
          sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.9,
          sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.5,
          sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster))
          / 1e+06
        labels:
          quantile: "0.5"

      # Fires when no target of the kube-scheduler job reports up == 1 for
      # five minutes.
      - alert: K8SSchedulerDown
        expr: >-
          absent(up{job="kube-scheduler"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Scheduler is down
          description: >-
            There is no running K8S scheduler. New pods are not being
            assigned to nodes.
          runbook: 'https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler'
|
|
@@ -1,59 +0,0 @@
|
|||||||
# Alerting rules built on kube-state-metrics series (deployments, daemon sets,
# pod restarts).
groups:
  - name: kube-state-metrics.rules
    rules:
      # Observed generation differs from the metadata generation for 15 minutes.
      - alert: DeploymentGenerationMismatch
        expr: >-
          kube_deployment_status_observed_generation
          != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Deployment is outdated
          description: >-
            Observed deployment generation does not match expected one
            for deployment {{$labels.namespace}}/{{$labels.deployment}}

      # Updated or available replica count differs from the spec, except for
      # deployments whose kube_deployment_spec_paused series equals 1.
      - alert: DeploymentReplicasNotUpdated
        expr: >-
          ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
          or
          (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
          unless
          (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Deployment replicas are outdated
          description: >-
            Replicas are not updated and available for deployment
            {{$labels.namespace}}/{{$labels.deployment}}

      # Ready pods as a percentage of desired scheduled pods is below 100.
      - alert: DaemonSetRolloutStuck
        expr: >-
          kube_daemonset_status_number_ready
          / kube_daemonset_status_desired_number_scheduled
          * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: DaemonSet is missing pods
          description: >-
            Only {{$value}}% of desired pods scheduled and ready for
            daemon set {{$labels.namespace}}/{{$labels.daemonset}}

      # Desired scheduled count exceeds the current scheduled count.
      - alert: K8SDaemonSetsNotScheduled
        expr: >-
          kube_daemonset_status_desired_number_scheduled
          - kube_daemonset_status_current_number_scheduled
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Daemonsets are not scheduled correctly
          description: A number of daemonsets are not scheduled.

      # At least one daemon set pod is reported as misscheduled.
      - alert: DaemonSetsMissScheduled
        expr: >-
          kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Daemonsets are not scheduled correctly
          description: >-
            A number of daemonsets are running where they are not
            supposed to run.

      # Container restart counter increased by more than 5 over one hour.
      - alert: PodFrequentlyRestarting
        expr: >-
          increase(kube_pod_container_status_restarts_total[1h]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Pod is restarting frequently
          description: >-
            Pod {{$labels.namespace}}/{{$labels.pod}} was restarted
            {{$value}} times within the last hour
|
|
@@ -1,48 +0,0 @@
|
|||||||
# Alerting rules for node readiness and kubelet scrape health.
groups:
  - name: kubelet.rules
    rules:
      # A node's Ready/true condition series has been 0 for one hour.
      - alert: K8SNodeNotReady
        expr: >-
          kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Node status is NotReady
          description: >-
            The Kubelet on {{ $labels.node }} has not checked in with
            the API, or has set itself to NotReady, for more than an hour

      # More than one node is not ready AND the not-ready share exceeds 20%.
      - alert: K8SManyNodesNotReady
        expr: >-
          count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1
          and
          (count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          / count(kube_node_status_condition{condition="Ready",status="true"}))
          * 100 > 20
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of Kubernetes nodes are not ready'

      # Warning tier: more than 3% of kubelet targets report up == 0.
      - alert: K8SKubeletDown
        expr: >-
          count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Prometheus failed to scrape
          description: Prometheus failed to scrape {{ $value }}% of kubelets.

      # Critical tier: no kubelet target reports up == 1, or more than 10% of
      # kubelet targets report up == 0.
      - alert: K8SKubeletDown
        expr: >-
          (absent(up{job="kubelet"} == 1)
          or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 10
        for: 1h
        labels:
          severity: critical
        annotations:
          summary: Many Kubelets cannot be scraped
          description: >-
            Prometheus failed to scrape {{ $value }}% of kubelets, or all
            Kubelets have disappeared from service discovery.

      # Running pod count on a kubelet is above 100.
      - alert: K8SKubeletTooManyPods
        expr: >-
          kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubelet is close to pod limit
          description: >-
            Kubelet {{$labels.instance}} is running {{$value}} pods,
            close to the limit of 110
|
|
@@ -1,106 +0,0 @@
|
|||||||
# Recording and alerting rules for container resource usage and the
# Kubernetes API server.
groups:
  - name: kubernetes.rules
    rules:
      # --- Per-pod container aggregates -----------------------------------
      - record: pod_name:container_memory_usage_bytes:sum
        expr: >-
          sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""})
          BY (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: >-
          sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""})
          BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: >-
          sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""})
          BY (pod_name)

      # --- Per-namespace container aggregates -----------------------------
      - record: namespace:container_memory_usage_bytes:sum
        expr: >-
          sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: >-
          sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      # NOTE(review): this selector filters container_name!="POD" while the two
      # namespace rules above filter container_name!="" - confirm whether the
      # difference is intentional. Left unchanged here.
      - record: namespace:container_cpu_usage:sum
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)

      # --- Cluster-level ratios -------------------------------------------
      - record: cluster:memory_usage:ratio
        expr: >-
          sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""})
          BY (cluster)
          / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: >-
          sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""})
          / 1000 / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)

      # --- API server latency quantiles -----------------------------------
      # Each quantile is computed over a 5m rate of the latency buckets and
      # divided by 1e+06 before being recorded under a *_seconds name.
      - record: apiserver_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m]))
          / 1e+06
        labels:
          quantile: "0.99"
      # The 0.9 quantile was originally recorded only under the inconsistent
      # name below, so it was absent from apiserver_latency_seconds:quantile.
      # It is now recorded under the shared name as well.
      - record: apiserver_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m]))
          / 1e+06
        labels:
          quantile: "0.9"
      # Legacy name kept so existing queries and dashboards that reference it
      # keep working.
      - record: apiserver_latency:quantile_seconds
        expr: >-
          histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m]))
          / 1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: >-
          histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m]))
          / 1e+06
        labels:
          quantile: "0.5"

      # --- API server alerts ----------------------------------------------
      # Warning tier: recorded 0.99 quantile above 1, excluding the "log"
      # subresource and the WATCH/WATCHLIST/PROXY/CONNECT verbs.
      - alert: APIServerLatencyHigh
        expr: >-
          apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: API server high latency
          description: >-
            the API server has a 99th percentile latency of {{ $value }}
            seconds for {{$labels.verb}} {{$labels.resource}}

      # Critical tier: same selector, threshold 4.
      - alert: APIServerLatencyHigh
        expr: >-
          apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: API server high latency
          description: >-
            the API server has a 99th percentile latency of {{ $value }}
            seconds for {{$labels.verb}} {{$labels.resource}}

      # Warning tier: share of requests with a 5xx code above 2%.
      - alert: APIServerErrorsHigh
        expr: >-
          rate(apiserver_request_count{code=~"^(?:5..)$"}[5m])
          / rate(apiserver_request_count[5m])
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: API server request errors
          description: API server returns errors for {{ $value }}% of requests

      # Critical tier: share of requests with a 5xx code above 5%.
      - alert: APIServerErrorsHigh
        expr: >-
          rate(apiserver_request_count{code=~"^(?:5..)$"}[5m])
          / rate(apiserver_request_count[5m])
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          # Summary added to match the warning-tier alert of the same name.
          summary: API server request errors
          description: API server returns errors for {{ $value }}% of requests

      # No target of the apiserver job reports up == 1 for 20 minutes.
      - alert: K8SApiserverDown
        expr: >-
          absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          summary: No API servers are reachable
          description: >-
            No API servers are reachable or all have disappeared from
            service discovery

      # --- Client certificate expiration ----------------------------------
      # Warning tier: observations in the le="604800" bucket (604800 s = 7 days).
      - alert: K8sCertificateExpirationNotice
        expr: >-
          sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
        labels:
          severity: warning
        annotations:
          summary: Kubernetes API Certificate is expiring soon
          description: Kubernetes API Certificate is expiring soon (less than 7 days)

      # Critical tier: observations in the le="86400" bucket (86400 s = 1 day).
      - alert: K8sCertificateExpirationNotice
        expr: >-
          sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
        labels:
          severity: critical
        annotations:
          summary: Kubernetes API Certificate is expiring
          description: Kubernetes API Certificate is expiring in less than 1 day
|
|
Reference in New Issue
Block a user