add etcd2 alerts
This commit is contained in:
121
assets/alerts/etcd2.rules
Normal file
121
assets/alerts/etcd2.rules
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
### General cluster availability ###
|
||||||
|
|
||||||
|
# alert if another failed peer will result in an unavailable cluster
#
# Fires with severity "critical" when the number of etcd-k8s targets whose
# `up` value is 0 exceeds (total number of etcd-k8s targets / 2 - 1), and
# that has held for 3 minutes. When no target is down the left-hand side is
# an empty vector, so the alert cannot fire.
# NOTE(review): for an even-sized cluster the threshold looks one too high
# (e.g. 4 targets -> 4 / 2 - 1 = 1, so a single down peer does not trip it
# even though, assuming majority quorum, one more failure would lose quorum)
# -- confirm whether even-sized clusters need to be covered.
|
||||||
|
ALERT InsufficientPeers
|
||||||
|
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
|
||||||
|
FOR 3m
|
||||||
|
LABELS {
|
||||||
|
severity = "critical"
|
||||||
|
}
|
||||||
|
ANNOTATIONS {
|
||||||
|
summary = "Etcd cluster small",
|
||||||
|
description = "If one more etcd peer goes down the cluster will be unavailable",
|
||||||
|
}
|
||||||
|
|
||||||
|
### HTTP requests alerts ###
|
||||||
|
|
||||||
|
# Warn if, for any HTTP method, more than 1% of the requests received over the
# last 5 minutes failed with a non-4xx response code, sustained for 10 minutes.
#
# The ratio is aggregated per `method` across all etcd-k8s targets, so the only
# label available to the templates is `method` (there is no `instance`).
# The ratio is multiplied by 100 so that {{ $value }} is an actual percentage;
# the firing condition (ratio > 0.01) is unchanged.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m]))) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on the etcd cluster",
  }
|
||||||
|
|
||||||
|
# Page if, for any HTTP method, more than 5% of the requests received over the
# last 5 minutes failed with a non-4xx response code, sustained for 5 minutes.
#
# The ratio is aggregated per `method` across all etcd-k8s targets, so the only
# label available to the templates is `method` (there is no `instance`).
# The ratio is multiplied by 100 so that {{ $value }} is an actual percentage;
# the firing condition (ratio > 0.05) is unchanged.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m]))) > 5
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on the etcd cluster",
  }
|
||||||
|
|
||||||
|
# Page if, for any HTTP method, more than 50% of the requests received over the
# last 5 minutes were answered with a 4xx response code, sustained for
# 10 minutes.
#
# The ratio is aggregated per `method` across all etcd-k8s targets, so the only
# label available to the templates is `method` (there is no `instance`).
# The ratio is multiplied by 100 so that {{ $value }} is an actual percentage;
# the firing condition (ratio > 0.5) is unchanged.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * (sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m]))) > 50
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on the etcd cluster",
  }
|
||||||
|
|
||||||
|
# Warn if the 99th percentile latency of successfully handled HTTP requests,
# computed from the 5-minute rate of the duration histogram buckets, stays
# above 0.15s (150ms) for 10 minutes.
#
# The histogram series are not aggregated, so their own labels are what the
# description template can reference. The template variable is `$labels`;
# the previous `$label.method` referenced an undefined variable and broke
# expansion of the description.
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }
|
||||||
|
|
||||||
|
### File descriptor alerts ###
|
||||||
|
|
||||||
|
# Recording rule: stores the ratio of open to maximum file descriptors
# (process_open_fds / process_max_fds) as instance:fd_utilization. No label
# selector is applied, so it is evaluated for every series exposing both
# metrics. The FdExhaustionClose alerts in this file extrapolate this series.
instance:fd_utilization = process_open_fds / process_max_fds
|
||||||
|
|
||||||
|
# Warn if file descriptors are likely to be exhausted within the next 4 hours.
#
# Linearly extrapolates the last hour of instance:fd_utilization 4 hours
# (3600 * 4 seconds) ahead; a predicted value above 1 means more descriptors
# open than the process maximum. The prediction must hold for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors within the next 4 hours",
  }
|
||||||
|
|
||||||
|
# Page if file descriptors are likely to be exhausted within the next hour.
#
# Linearly extrapolates the last 10 minutes of instance:fd_utilization one
# hour (3600 seconds) ahead; a predicted value above 1 means more descriptors
# open than the process maximum. The prediction must hold for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors within the next hour",
  }
|
||||||
|
|
||||||
|
### etcd proposal alerts ###
|
||||||
|
|
||||||
|
# Warn if an etcd instance has recorded more than 5 failed proposals within
# the last hour. There is no FOR clause, so the alert fires as soon as the
# condition is true.
#
# The selector uses job="etcd-k8s" like the other job-scoped rules in this
# file; the previous job="etcd" matched a different job name and would not
# select the same targets.
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposal_failed_total{job="etcd-k8s"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of failed proposals within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }
|
||||||
|
|
||||||
|
### etcd disk io latency alerts ###
|
||||||
|
|
||||||
|
# Warn if the 99th percentile of WAL fsync durations, computed from the
# 5-minute rate of the histogram buckets, stays above 0.5s (500ms) for
# 10 minutes.
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fsync durations are high",
  }
|
@@ -1,5 +1,57 @@
|
|||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
data:
|
data:
|
||||||
|
etcd2.rules: "### General cluster availability ###\n\n# alert if another failed
|
||||||
|
peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"}
|
||||||
|
== 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity
|
||||||
|
= \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n
|
||||||
|
\ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n
|
||||||
|
\ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to
|
||||||
|
an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
|
||||||
|
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n
|
||||||
|
\ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) >
|
||||||
|
0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
|
||||||
|
\ summary = \"a high number of HTTP requests are failing\",\n description
|
||||||
|
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||||
|
{{ $labels.instance }}\",\n }\n\n# alert if more than 5% of requests to an HTTP
|
||||||
|
endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
|
||||||
|
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))
|
||||||
|
\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
|
||||||
|
> 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||||
|
{\n summary = \"a high number of HTTP requests are failing\",\n description
|
||||||
|
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||||
|
{{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT
|
||||||
|
HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\",
|
||||||
|
code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
|
||||||
|
> 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||||
|
{\n summary = \"a high number of HTTP requests are failing\",\n description
|
||||||
|
= \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses
|
||||||
|
on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile
|
||||||
|
of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99,
|
||||||
|
rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS
|
||||||
|
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP
|
||||||
|
requests\",\n description = \"on etcd instance {{ $labels.instance }} HTTP
|
||||||
|
requests to {{ $labels.method }} are slow\",\n }\n\n### File descriptor alerts
|
||||||
|
###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert
|
||||||
|
if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n
|
||||||
|
\ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS
|
||||||
|
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors
|
||||||
|
soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance
|
||||||
|
}} will exhaust in file descriptors soon\",\n }\n\n# alert if file descriptors
|
||||||
|
are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m],
|
||||||
|
3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||||
|
{\n summary = \"file descriptors soon exhausted\",\n description = \"{{
|
||||||
|
$labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors
|
||||||
|
soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed
|
||||||
|
proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h])
|
||||||
|
> 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary
|
||||||
|
= \"a high number of failed proposals within the etcd cluster are happening\",\n
|
||||||
|
\ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }}
|
||||||
|
proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts
|
||||||
|
###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT
|
||||||
|
HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
|
||||||
|
> 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
|
||||||
|
\ summary = \"high fsync durations\",\n description = \"etcd instance {{
|
||||||
|
$labels.instance }} fsync durations are high\",\n }\n"
|
||||||
kubernetes.rules: |+
|
kubernetes.rules: |+
|
||||||
### Container resources ###
|
### Container resources ###
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user