kube-prometheus/assets/prometheus/rules/etcd2.rules

### General cluster availability ###

# Alert if another failed peer will result in an unavailable cluster.
#
# Left-hand side: number of "etcd-k8s" targets whose `up` sample is 0.
# Right-hand side: half the number of scraped "etcd-k8s" targets, minus one.
# Example: with 3 targets the threshold is 0.5, so a single down peer
# satisfies the condition; with 5 targets the threshold is 1.5, so two down
# peers are required.
# When no target is down the left-hand count() has no input series and
# returns nothing, so the expression is empty and the alert stays quiet.
# The condition has to hold for 3 minutes before the alert fires.
ALERT InsufficientPeers
  IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
  FOR 3m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Etcd cluster small",
    description = "If one more etcd peer goes down the cluster will be unavailable",
  }

### HTTP requests alerts ###

# Alert (warning) if more than 1% of requests to an HTTP endpoint have failed
# with a non-4xx response for 10 minutes.
#
# Both operands are aggregated with `sum by(method)`, so `method` is the only
# label left on the result. The figure is therefore cluster-wide per method,
# and the description must not reference an instance label (it would render
# as an empty string).
# The ratio is multiplied by 100 so that $value really is a percentage; the
# threshold is scaled accordingly (1 == 1%), leaving the firing condition
# unchanged.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed across the etcd cluster",
  }

# Alert (critical) if more than 5% of requests to an HTTP endpoint have failed
# with a non-4xx response for 5 minutes.
#
# Both operands are aggregated with `sum by(method)`, so `method` is the only
# label left on the result. The figure is therefore cluster-wide per method,
# and the description must not reference an instance label (it would render
# as an empty string).
# The ratio is multiplied by 100 so that $value really is a percentage; the
# threshold is scaled accordingly (5 == 5%), leaving the firing condition
# unchanged.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 5
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed across the etcd cluster",
  }

# Alert (critical) if more than 50% of requests to an HTTP endpoint received a
# 4xx response for 10 minutes.
#
# Both operands are aggregated with `sum by(method)`, so `method` is the only
# label left on the result. The figure is therefore cluster-wide per method,
# and the description must not reference an instance label (it would render
# as an empty string).
# The ratio is multiplied by 100 so that $value really is a percentage; the
# threshold is scaled accordingly (50 == 50%), leaving the firing condition
# unchanged.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 50
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses across the etcd cluster",
  }

# Alert if the 99th percentile of successful HTTP request durations exceeds
# 150ms (0.15s) for 10 minutes.
#
# The rate() is not aggregated, so every label of the bucket series other
# than `le` (consumed by histogram_quantile) is kept on the result and is
# available to the description template through $labels.
# NOTE(review): the metric name ends in `_second_bucket` (singular); newer
# etcd releases may expose `_seconds_bucket` - confirm against the scraped
# etcd version.
# NOTE(review): unlike the other HTTP alerts there is no job selector here -
# confirm that matching every job exposing this metric is intended.
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }

### File descriptor alerts ###

# Recording rule: fraction of the file-descriptor limit currently in use,
# computed per series as open descriptors divided by the maximum allowed.
# A value of 1 means the limit has been reached. There is no job selector,
# so every scraped target exposing both metrics gets a recorded series.
instance:fd_utilization = process_open_fds / process_max_fds

# Alert (warning) if file descriptors are likely to be exhausted within the
# next 4 hours.
#
# predict_linear() fits a linear trend to the last hour of the recorded
# instance:fd_utilization series and extrapolates it 3600 * 4 seconds ahead.
# A predicted utilization above 1 means the open descriptor count would
# exceed the process limit. The prediction must stay above 1 for 10 minutes
# before the alert fires.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
  }

# Alert (critical) if file descriptors are likely to be exhausted within the
# next hour.
#
# predict_linear() fits a linear trend to the last 10 minutes of the recorded
# instance:fd_utilization series and extrapolates it 3600 seconds ahead.
# A predicted utilization above 1 means the open descriptor count would
# exceed the process limit. The prediction must stay above 1 for 10 minutes
# before the alert fires.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
  }

### etcd proposal alerts ###

# Alert if an etcd member has seen more than 5 failed proposals within the
# last hour.
#
# The job selector is "etcd-k8s", the same job name used by the other
# job-scoped rules in this file; with a different job name the selector
# would match no series and the alert could never fire.
# There is no FOR clause, so the alert fires on the first evaluation in
# which the condition is true.
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposal_failed_total{job="etcd-k8s"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of failed proposals within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }

### etcd disk io latency alerts ###

# Alert if the 99th percentile of WAL fsync durations is higher than 500ms
# (0.5s) for 10 minutes.
#
# The rate() is not aggregated, so the quantile is computed per bucket series
# and the labels other than `le` remain available to the description template.
# NOTE(review): there is no job selector - confirm that matching every job
# exposing this metric is intended.
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fsync durations are high",
  }