For `--dry-run` to work, kubectl actually has to contact a Kubernetes cluster's apiserver, which is unnecessary for generating these manifests. This approach also allows further customization, such as adding labels to the generated manifests.
122 lines
4.1 KiB
Plaintext
122 lines
4.1 KiB
Plaintext
### General cluster availability ###
|
|
|
|
# Fires when losing one more etcd peer would cost the cluster its majority:
# the number of scrape targets reporting up == 0 is compared against
# (total targets / 2 - 1) for the "etcd-k8s" job, sustained for 3 minutes.
ALERT InsufficientPeers
  IF count(up{job="etcd-k8s"} == 0)
       > (count(up{job="etcd-k8s"}) / 2 - 1)
  FOR 3m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary     = "Etcd cluster small",
    description = "If one more etcd peer goes down the cluster will be unavailable",
  }
|
|
|
|
### HTTP requests alerts ###
|
|
|
|
# Alert if more than 1% of requests to an HTTP endpoint have failed with a
# non-4xx response, evaluated per HTTP method and per etcd instance over a
# 5 minute rate window, sustained for 10 minutes.
#
# The ratio is grouped by (method, instance) rather than by (method) alone so
# that the instance label survives aggregation and the description's
# $labels.instance reference is populated. The ratio is multiplied by 100 so
# that $value is a real percentage, matching the "%" in the description.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (
         sum by(method, instance) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
       / sum by(method, instance) (rate(etcd_http_received_total{job="etcd-k8s"}[5m]))
     ) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }
|
|
|
|
# Alert if more than 5% of requests to an HTTP endpoint have failed with a
# non-4xx response, evaluated per HTTP method and per etcd instance over a
# 5 minute rate window, sustained for 5 minutes.
#
# Grouping by (method, instance) keeps the instance label that the description
# template reads; grouping by (method) alone would leave it empty. The ratio
# is scaled by 100 so that $value is the percentage the description states.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (
         sum by(method, instance) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
       / sum by(method, instance) (rate(etcd_http_received_total{job="etcd-k8s"}[5m]))
     ) > 5
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }
|
|
|
|
# Alert if more than 50% of requests get a 4xx response, evaluated per HTTP
# method and per etcd instance over a 5 minute rate window, sustained for
# 10 minutes.
#
# Grouping by (method, instance) keeps the instance label that the description
# template reads; grouping by (method) alone would leave it empty. The ratio
# is scaled by 100 so that $value is the percentage the description states.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (
         sum by(method, instance) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
       / sum by(method, instance) (rate(etcd_http_received_total{job="etcd-k8s"}[5m]))
     ) > 50
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
  }
|
|
|
|
# Alert if the 99th percentile of successful HTTP request durations stays
# above 0.15 (150ms per the intent of this rule) for 10 minutes, computed from
# the 5 minute rate of the duration histogram buckets.
#
# The description must use $labels (plural): $label is not a variable the
# alert template defines, so the original reference could not be expanded.
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }
|
|
|
|
### File descriptor alerts ###
|
|
|
|
# Recording rule: fraction of the file descriptor limit currently in use,
# i.e. open descriptors divided by the maximum allowed. A value of 1 means the
# limit has been reached. The FdExhaustionClose alerts extrapolate this series
# with predict_linear.
instance:fd_utilization = process_open_fds / process_max_fds
|
|
|
|
# Alert if file descriptors are likely to be exhausted within the next 4 hours:
# a linear extrapolation of the last 1h of instance:fd_utilization, projected
# 3600 * 4 seconds ahead, exceeds 1 (more descriptors open than allowed),
# sustained for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  }
|
|
|
|
# Alert if file descriptors are likely to be exhausted within the next hour:
# a linear extrapolation of the last 10m of instance:fd_utilization, projected
# 3600 seconds ahead, exceeds 1 (more descriptors open than allowed),
# sustained for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  }
|
|
|
|
### etcd proposal alerts ###
|
|
|
|
# Alert if there are more than 5 failed proposals within an hour on an etcd
# instance. There is no FOR clause, so the alert fires as soon as the
# condition holds at evaluation time.
#
# The selector uses job="etcd-k8s" to agree with the other etcd selectors in
# this rules file; the previous job="etcd" would not match series scraped
# under the "etcd-k8s" job.
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposal_failed_total{job="etcd-k8s"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of failed proposals within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }
|
|
|
|
### etcd disk io latency alerts ###
|
|
|
|
# Alert if the 99th percentile of WAL fsync durations stays above 0.5
# (500ms per the intent of this rule) for 10 minutes, computed from the
# 5 minute rate of the fsync duration histogram buckets.
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fsync durations are high",
  }
|