add etcd2 alerts

2016-11-01 16:00:54 +01:00
parent bb752d6f56
commit 2e5bcc1671
2 changed files with 173 additions and 0 deletions
--- a/manifests/prometheus/prometheus-k8s-rules.yaml
+++ b/manifests/prometheus/prometheus-k8s-rules.yaml
@@ -1,5 +1,57 @@
 apiVersion: v1
 data:
+  etcd2.rules: "### General cluster availability ###\n\n# alert if another failed
+    peer will result in an unavailable cluster\nALERT InsufficientPeers\n  IF count(up{job=\"etcd-k8s\"}
+    == 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n  FOR 3m\n  LABELS {\n    severity
+    = \"critical\"\n  }\n  ANNOTATIONS {\n    summary = \"Etcd cluster small\",\n
+    \   description = \"If one more etcd peer goes down the cluster will be unavailable\",\n
+    \ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to
+    an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
+    \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n
+    \   / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) >
+    0.01\n  FOR 10m\n  LABELS {\n    severity = \"warning\"\n  }\n  ANNOTATIONS {\n
+    \   summary = \"a high number of HTTP requests are failing\",\n    description
+    = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
+    {{ $labels.instance }}\",\n  }\n\n# alert if more than 5% of requests to an HTTP
+    endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
+    \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))
+    \n    / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
+    > 0.05\n  FOR 5m\n  LABELS {\n    severity = \"critical\"\n  }\n  ANNOTATIONS
+    {\n    summary = \"a high number of HTTP requests are failing\",\n    description
+    = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
+    {{ $labels.instance }}\",\n  }\n\n# alert if 50% of requests get a 4xx response\nALERT
+    HighNumberOfFailedHTTPRequests\n  IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\",
+    code=~\"4[0-9]{2}\"}[5m]))\n    / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
+    > 0.5\n  FOR 10m\n  LABELS {\n    severity = \"critical\"\n  }\n  ANNOTATIONS
+    {\n    summary = \"a high number of HTTP requests are failing\",\n    description
+    = \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses
+    on etcd instance {{ $labels.instance }}\",\n  }\n\n# alert if the 99th percentile
+    of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n  IF histogram_quantile(0.99,
+    rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n  FOR 10m\n  LABELS
+    {\n    severity = \"warning\"\n  }\n  ANNOTATIONS {\n    summary = \"slow HTTP
+    requests\",\n    description = \"on etcd instance {{ $labels.instance }} HTTP
+    requests to {{ $labels.method }} are slow\",\n  }\n\n### File descriptor alerts
+    ###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert
+    if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n
+    \ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n  FOR 10m\n  LABELS
+    {\n    severity = \"warning\"\n  }\n  ANNOTATIONS {\n    summary = \"file descriptors
+    soon exhausted\",\n    description = \"{{ $labels.job }} instance {{ $labels.instance
+    }} will exhaust its file descriptors soon\",\n  }\n\n# alert if file descriptors
+    are likely to exhaust within the next hour\nALERT FdExhaustionClose\n  IF predict_linear(instance:fd_utilization[10m],
+    3600) > 1\n  FOR 10m\n  LABELS {\n    severity = \"critical\"\n  }\n  ANNOTATIONS
+    {\n    summary = \"file descriptors soon exhausted\",\n    description = \"{{
+    $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors
+    soon\",\n  }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed
+    proposals within an hour\nALERT HighNumberOfFailedProposals\n  IF increase(etcd_server_proposal_failed_total{job=\"etcd-k8s\"}[1h])
+    > 5\n  LABELS {\n    severity = \"warning\"\n  }\n  ANNOTATIONS {\n    summary
+    = \"a high number of failed proposals within the etcd cluster are happening\",\n
+    \   description = \"etcd instance {{ $labels.instance }} has seen {{ $value }}
+    proposal failures within the last hour\",\n  }\n\n### etcd disk io latency alerts
+    ###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT
+    HighFsyncDurations\n  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
+    > 0.5\n  FOR 10m\n  LABELS {\n    severity = \"warning\"\n  }\n  ANNOTATIONS {\n
+    \   summary = \"high fsync durations\",\n    description = \"etcd instance {{
+    $labels.instance }} fsync durations are high\",\n  }\n"
  kubernetes.rules: |+
    ### Container resources ###