apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
  alertmanager.rules: |+
    ALERT AlertmanagerConfigInconsistent
      IF count_values by (service) ("config_hash", alertmanager_config_hash)
           / on(service) group_left
         label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Alertmanager configurations are inconsistent",
        description = "The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
      }

    ALERT AlertmanagerDownOrMissing
      IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
           / on(job) group_right
         sum by(job) (up) != 1
      FOR 5m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Alertmanager down or not discovered",
        description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
      }

    ALERT FailedReload
      IF alertmanager_config_last_reload_successful == 0
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Alertmanager configuration reload has failed",
        description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}."
      }
  etcd3.rules: |+
    # general cluster availability

    # alert if another failed member will result in an unavailable cluster
    ALERT InsufficientMembers
      IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
      FOR 3m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "etcd cluster insufficient members",
        description = "If one more etcd member goes down the cluster will be unavailable",
      }

    # etcd leader alerts
    # ==================

    # alert if any etcd instance has no leader
    ALERT NoLeader
      IF etcd_server_has_leader{job="etcd"} == 0
      FOR 1m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "etcd member has no leader",
        description = "etcd member {{ $labels.instance }} has no leader",
      }

    # alert if there are lots of leader changes
    ALERT HighNumberOfLeaderChanges
      IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of leader changes within the etcd cluster are happening",
        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
      }

    # gRPC request alerts
    # ===================

    # alert if more than 1% of gRPC method calls have failed within the last 5 minutes
    ALERT HighNumberOfFailedGRPCRequests
      IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
           / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of gRPC requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if more than 5% of gRPC method calls have failed within the last 5 minutes
    ALERT HighNumberOfFailedGRPCRequests
      IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
           / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "a high number of gRPC requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if the 99th percentile of gRPC method calls take more than 150ms
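    # histogram_quantile(0.99, rate(<histogram>_bucket[5m])) estimates the 99th
    # percentile from the bucket counters over the last 5 minutes; the metric is
    # measured in seconds, so the 0.15 threshold below corresponds to 150ms.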
    ALERT GRPCRequestsSlow
      IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "slow gRPC requests",
        description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
      }

    # HTTP request alerts
    # ====================

    # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
           / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
    ALERT HighNumberOfFailedHTTPRequests
      IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
           / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "a high number of HTTP requests are failing",
        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
      }

    # alert if the 99th percentile of HTTP requests take more than 150ms
    ALERT HTTPRequestsSlow
      IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "slow HTTP requests",
        description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
      }

    # etcd member communication alerts
    # ================================

    # alert if the 99th percentile of round trips takes more than 150ms
    ALERT EtcdMemberCommunicationSlow
      IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "etcd member communication is slow",
        description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
      }

    # etcd proposal alerts
    # ====================

    # alert if there are several failed proposals within an hour
    ALERT HighNumberOfFailedProposals
      IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "a high number of proposals within the etcd cluster are failing",
        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
      }

    # etcd disk io latency alerts
    # ===========================

    # alert if the 99th percentile of fsync durations is higher than 500ms
    ALERT HighFsyncDurations
      IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "high fsync durations",
        description = "etcd instance {{ $labels.instance }} fsync durations are high",
      }

    # alert if the 99th percentile of commit durations is higher than 250ms
    ALERT HighCommitDurations
      IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "high commit durations",
        description = "etcd instance {{ $labels.instance }} commit durations are high",
      }
  general.rules: |+
    ### Up Alerting ###

    ALERT TargetDown
      IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Targets are down",
        description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
      }

    ### Dead man's switch ###

    ALERT DeadMansSwitch
      IF vector(1)
      LABELS {
        severity = "none",
      }
      ANNOTATIONS {
        summary = "Alerting DeadMansSwitch",
        description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
      }

    ### File descriptor alerts ###

    ALERT TooManyOpenFileDescriptors
      IF 100 * (process_open_fds / process_max_fds) > 95
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "too many open file descriptors",
        description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
      }

    instance:fd_utilization = process_open_fds / process_max_fds

    # alert if file descriptors are likely to exhaust within the next 4 hours
    ALERT FdExhaustionClose
      IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "file descriptors soon exhausted",
        description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its available file/socket descriptors soon.",
      }

    # alert if file descriptors are likely to exhaust within the next hour
    ALERT FdExhaustionClose
      IF predict_linear(instance:fd_utilization[10m], 3600) > 1
      FOR 10m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "file descriptors soon exhausted",
        description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its available file/socket descriptors soon.",
      }
  kube-apiserver.rules: |+
    ALERT K8SApiserverDown
      IF absent(up{job="apiserver"} == 1)
      FOR 5m
      LABELS {
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "API server unreachable",
        description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
      }

    # Some verbs excluded because they are expected to be long-lasting:
    # WATCHLIST is long-poll, CONNECT is `kubectl exec`.
    #
    # apiserver_request_latencies' unit is microseconds
    ALERT K8SApiServerLatency
      IF histogram_quantile(
           0.99,
           sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
         ) / 1e6 > 1.0
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Kubernetes apiserver latency is high",
        description = "99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
      }
  kube-controller-manager.rules: |+
    ALERT K8SControllerManagerDown
      IF absent(up{job="kube-controller-manager"} == 1)
      FOR 5m
      LABELS {
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Controller manager is down",
        description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
      }
  kubelet.rules: |+
    ALERT K8SNodeNotReady
      IF kube_node_status_condition{condition="Ready", status="true"} == 0
      FOR 1h
      LABELS {
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Node status is NotReady",
        description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
      }

    ALERT K8SManyNodesNotReady
      IF count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1
         AND
           (
             count(kube_node_status_condition{condition="Ready", status="true"} == 0)
               / count(kube_node_status_condition{condition="Ready", status="true"})
           ) > 0.2
      FOR 1m
      LABELS {
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Many Kubernetes nodes are Not Ready",
        description = "{{ $value }} Kubernetes nodes (more than 20% of the cluster) are in the NotReady state.",
      }

    ALERT K8SKubeletDown
      IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
      FOR 1h
      LABELS {
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Many Kubelets cannot be scraped",
        description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
      }

    ALERT K8SKubeletDown
      IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
      FOR 1h
      LABELS {
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Many Kubelets cannot be scraped",
        description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
      }

    ALERT K8SKubeletTooManyPods
      IF kubelet_running_pod_count > 100
      LABELS {
        severity = "warning",
      }
      ANNOTATIONS {
        summary = "Kubelet is close to pod limit",
        description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
      }
  kubernetes.rules: |+
    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
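    # The recording rules below follow the level:metric:operations naming scheme.
    # label_replace() copies the controller name into a "controller" label by
    # stripping the trailing hash from pod_name with the regex ^(.*)-[a-z0-9]+,
    # so resource usage can be aggregated per controller instead of per pod.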
    ### Container resources ###

    cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:spec_cpu_shares =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:cpu_usage:rate =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_usage:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_working_set:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_rss:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_cache:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:disk_usage:bytes =
      sum by (cluster,namespace,controller,pod_name,container_name) (
        label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_pagefaults:rate =
      sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
        label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    cluster_namespace_controller_pod_container:memory_oom:rate =
      sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
        label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")
      )

    ### Cluster resources ###

    cluster:memory_allocation:percent =
      100 * sum by (cluster) (container_spec_memory_limit_bytes{pod_name!=""})
        / sum by (cluster) (machine_memory_bytes)

    cluster:memory_used:percent =
      100 * sum by (cluster) (container_memory_usage_bytes{pod_name!=""})
        / sum by (cluster) (machine_memory_bytes)

    cluster:cpu_allocation:percent =
      100 * sum by (cluster) (container_spec_cpu_shares{pod_name!=""})
        / sum by (cluster) (container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores)

    cluster:node_cpu_use:percent =
      100 * sum by (cluster) (rate(node_cpu{mode!="idle"}[5m]))
        / sum by (cluster) (machine_cpu_cores)

    ### API latency ###

    # Raw metrics are in microseconds. Convert to seconds.
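    # In this Prometheus 1.x rule syntax, labels on the left-hand side of a
    # recording rule (e.g. {quantile="0.99"}) are attached to the recorded
    # series, so each metric below yields one series per quantile.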
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99, sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)) / 1e6
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9, sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)) / 1e6
    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5, sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)) / 1e6

    ### Scheduling latency ###

    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5, sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5, sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
      histogram_quantile(0.99, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
      histogram_quantile(0.9, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
      histogram_quantile(0.5, sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
  kube-scheduler.rules: |+
    ALERT K8SSchedulerDown
      IF absent(up{job="kube-scheduler"} == 1)
      FOR 5m
      LABELS {
        severity = "critical",
      }
      ANNOTATIONS {
        summary = "Scheduler is down",
        description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler",
      }
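    # The absent(up{job="..."} == 1) pattern above fires both when every target of
    # the job reports down and when the job has disappeared from service discovery
    # entirely; the same idiom backs K8SApiserverDown, K8SControllerManagerDown,
    # K8SKubeletDown and NodeExporterDown.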
  node.rules: |+
    ALERT NodeExporterDown
      IF absent(up{job="node-exporter"} == 1)
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "node-exporter cannot be scraped",
        description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
      }

    ALERT K8SNodeOutOfDisk
      IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1
      LABELS {
        service = "k8s",
        severity = "critical"
      }
      ANNOTATIONS {
        summary = "Node ran out of disk space.",
        description = "{{ $labels.node }} has run out of disk space.",
      }

    ALERT K8SNodeMemoryPressure
      IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Node is under memory pressure.",
        description = "{{ $labels.node }} is under memory pressure.",
      }

    ALERT K8SNodeDiskPressure
      IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1
      LABELS {
        service = "k8s",
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Node is under disk pressure.",
        description = "{{ $labels.node }} is under disk pressure.",
      }
  prometheus.rules: |+
    ALERT FailedReload
      IF prometheus_config_last_reload_successful == 0
      FOR 10m
      LABELS {
        severity = "warning"
      }
      ANNOTATIONS {
        summary = "Prometheus configuration reload has failed",
        description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}."
      }
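# The kube-prometheus / Prometheus Operator setup this ConfigMap appears to target
# typically picks up rule ConfigMaps through the Prometheus resource's ruleSelector,
# which matches the role/prometheus labels above; verify the selector used in your
# deployment. Example (the file name and "monitoring" namespace are assumptions):
#
#   kubectl apply -n monitoring -f prometheus-k8s-rules.yaml
#
# Note that these files use the Prometheus 1.x rule syntax; Prometheus 2.x expects
# YAML rule files, so they must be converted before use with a 2.x server.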