groups:
- name: kubernetes.rules
  rules:
  # Per-pod total of container_memory_usage_bytes, skipping series whose
  # container_name is "POD" or whose pod_name is empty.
  - record: pod_name:container_memory_usage_bytes:sum
    expr: >-
      sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""})
      BY (pod_name)
  # Per-pod total of container_spec_cpu_shares, skipping series whose
  # container_name is "POD" or whose pod_name is empty.
  - record: pod_name:container_spec_cpu_shares:sum
    expr: >-
      sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""})
      BY (pod_name)
  # Per-pod sum of the 5-minute rate of container_cpu_usage_seconds_total,
  # skipping series whose container_name is "POD" or whose pod_name is empty.
  - record: pod_name:container_cpu_usage:sum
    expr: >-
      sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) BY
      (pod_name)
  # Per-pod total of container_fs_usage_bytes, skipping series whose
  # container_name is "POD" or whose pod_name is empty.
  - record: pod_name:container_fs_usage_bytes:sum
    expr: >-
      sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""})
      BY (pod_name)
  # Per-namespace total of container_memory_usage_bytes over every series
  # that carries a non-empty container_name.
  - record: namespace:container_memory_usage_bytes:sum
    expr: >-
      sum(container_memory_usage_bytes{container_name!=""})
      BY (namespace)
  # Per-namespace total of container_spec_cpu_shares over every series
  # that carries a non-empty container_name.
  - record: namespace:container_spec_cpu_shares:sum
    expr: >-
      sum(container_spec_cpu_shares{container_name!=""})
      BY (namespace)
  # Per-namespace sum of the 5-minute rate of
  # container_cpu_usage_seconds_total, skipping container_name="POD".
  # NOTE(review): the namespace-level memory and cpu-shares rules in this
  # group filter on container_name!="" instead of container_name!="POD" --
  # confirm that the different selector here is intentional.
  - record: namespace:container_cpu_usage:sum
    expr: >-
      sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) BY
      (namespace)
  # Container memory usage divided by machine_memory_bytes, both summed
  # per `cluster` label. Container series with container_name="POD" or an
  # empty pod_name are left out of the numerator.
  - record: cluster:memory_usage:ratio
    expr: >-
      sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY (cluster)
      /
      sum(machine_memory_bytes) BY (cluster)
  # Total container_spec_cpu_shares (container_name="POD" and empty
  # pod_name excluded), divided by 1000 and then by the total of
  # machine_cpu_cores.
  # NOTE(review): the divisor treats 1000 shares as one core; Kubernetes is
  # understood to map one CPU to 1024 shares -- confirm the constant.
  - record: cluster:container_spec_cpu_shares:ratio
    expr: >-
      sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""})
      / 1000
      / sum(machine_cpu_cores)
  # Sum of the 5-minute rate of container_cpu_usage_seconds_total
  # (container_name="POD" and empty pod_name excluded) divided by the total
  # of machine_cpu_cores.
  - record: cluster:container_cpu_usage:ratio
    expr: >-
      sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) /
      sum(machine_cpu_cores)
  # 0.99 quantile of apiserver_request_latencies_bucket (5-minute rate),
  # divided by 1e+06 and tagged with quantile="0.99".
  - record: apiserver_latency_seconds:quantile
    labels:
      quantile: '0.99'
    expr: >-
      histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m]))
      / 1e+06
  # 0.9 quantile of apiserver_request_latencies_bucket (5-minute rate),
  # divided by 1e+06 and tagged with quantile="0.9".
  # Recorded under apiserver_latency_seconds:quantile so that it shares its
  # metric name with the 0.99 and 0.5 quantile rules of this group and is
  # told apart only by the `quantile` label.
  - record: apiserver_latency_seconds:quantile
    expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
      1e+06
    labels:
      quantile: "0.9"
  # Legacy spelling of the same series (name segments swapped). Kept so that
  # existing queries and dashboards referencing the old name keep working;
  # new consumers should use apiserver_latency_seconds:quantile instead.
  - record: apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
      1e+06
    labels:
      quantile: "0.9"
  # 0.5 quantile (median) of apiserver_request_latencies_bucket (5-minute
  # rate), divided by 1e+06 and tagged with quantile="0.5".
  - record: apiserver_latency_seconds:quantile
    labels:
      quantile: '0.5'
    expr: >-
      histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m]))
      / 1e+06
  # Warning: the recorded 0.99 latency quantile has stayed above 1 for 10
  # minutes. Series with subresource="log" and the verbs WATCH, WATCHLIST,
  # PROXY and CONNECT are excluded.
  - alert: APIServerLatencyHigh
    for: 10m
    expr: >-
      apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
      > 1
    labels:
      severity: warning
    annotations:
      summary: API server high latency
      description: >-
        the API server has a 99th percentile latency of {{ $value }}
        seconds for {{$labels.verb}} {{$labels.resource}}
  # Critical: the recorded 0.99 latency quantile has stayed above 4 for 10
  # minutes. Series with subresource="log" and the verbs WATCH, WATCHLIST,
  # PROXY and CONNECT are excluded.
  - alert: APIServerLatencyHigh
    for: 10m
    expr: >-
      apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
      > 4
    labels:
      severity: critical
    annotations:
      summary: API server high latency
      description: >-
        the API server has a 99th percentile latency of {{ $value }}
        seconds for {{$labels.verb}} {{$labels.resource}}
  # Warning: more than 2% of API server requests (5-minute rate) were
  # answered with a 5xx status code, sustained for 10 minutes.
  # Both sides are aggregated with sum() before dividing. A plain
  # series-by-series division pairs every 5xx series only with itself
  # (one-to-one matching on identical label sets), which always yields 100
  # or NaN instead of the share of failing requests.
  - alert: APIServerErrorsHigh
    expr: >-
      sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]))
      / sum(rate(apiserver_request_count[5m]))
      * 100 > 2
    for: 10m
    labels:
      severity: warning
    annotations:
      description: API server returns errors for {{ $value }}% of requests
      summary: API server request errors
  # Critical: more than 5% of API server requests (5-minute rate) were
  # answered with a 5xx status code, sustained for 10 minutes.
  # Both sides are aggregated with sum() before dividing. A plain
  # series-by-series division pairs every 5xx series only with itself
  # (one-to-one matching on identical label sets), which always yields 100
  # or NaN instead of the share of failing requests.
  - alert: APIServerErrorsHigh
    expr: >-
      sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]))
      / sum(rate(apiserver_request_count[5m]))
      * 100 > 5
    for: 10m
    labels:
      severity: critical
    annotations:
      description: API server returns errors for {{ $value }}% of requests
      # Same summary as the warning-level variant of this alert.
      summary: API server request errors
  # Critical: for 20 minutes there has been no `up` series with
  # job="apiserver" whose value equals 1.
  - alert: K8SApiserverDown
    for: 20m
    expr: 'absent(up{job="apiserver"} == 1)'
    labels:
      severity: critical
    annotations:
      summary: No API servers are reachable
      description: >-
        No API servers are reachable or all have disappeared
        from service discovery

  # Warning: during the last 5 minutes, observations landed in the
  # le="604800" (7 days, in seconds) bucket of
  # apiserver_client_certificate_expiration_seconds.
  # rate() is applied because histogram buckets are cumulative counters:
  # comparing the raw bucket value to 0 keeps the alert firing for the
  # lifetime of the counter, even after the certificate has been renewed.
  - alert: K8sCertificateExpirationNotice
    expr: >-
      sum(rate(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}[5m]))
      > 0
    labels:
      severity: warning
    annotations:
      description: Kubernetes API Certificate is expiring soon (less than 7 days)
      summary: Kubernetes API Certificate is expiring soon

  # Critical: during the last 5 minutes, observations landed in the
  # le="86400" (1 day, in seconds) bucket of
  # apiserver_client_certificate_expiration_seconds.
  # rate() is applied because histogram buckets are cumulative counters:
  # comparing the raw bucket value to 0 keeps the alert firing for the
  # lifetime of the counter, even after the certificate has been renewed.
  - alert: K8sCertificateExpirationNotice
    expr: >-
      sum(rate(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}[5m]))
      > 0
    labels:
      severity: critical
    annotations:
      description: Kubernetes API Certificate is expiring in less than 1 day
      summary: Kubernetes API Certificate is expiring