kube-prometheus: drop conntrack alerts and direct up alerts
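The "direct up alerts" dropped here fired one alert per down target (IF up == 0); the TargetDown rule in the first hunk below aggregates across all targets instead. A rough sketch of that aggregation in PromQL — the 3% threshold and 10m hold come from the diff, the comments are illustrative:

# Percentage of scrape targets currently down:
# count(up == 0) counts the down targets, count(up) counts all targets.
100 * (count(up == 0) / count(up))
# The TargetDown alert below fires when this stays above 3 for 10 minutes.
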
@@ -1,14 +1,14 @@
 ### Up Alerting ###
 
 Alert TargetDown
-  IF up == 0
+  IF 100 * (count(up == 0) / count(up)) > 3
   FOR 10m
   LABELS {
     severity = "warning"
   }
   ANNOTATIONS {
-    summary = "target is down",
-    description = "A target of type {{ $labels.job }} is down."
+    summary = "Targets are down",
+    description = "More than {{ $value }}% of targets are down."
   }
 
 ### Dead man's switch ###
@@ -25,26 +25,15 @@ ALERT DeadMansSwitch
 
 ### File descriptor alerts ###
 
-ALERT TooManyOpenFiles
-  IF 100*process_open_fds / process_max_fds > 50
-  FOR 10m
-  LABELS {
-    severity = "warning"
-  }
-  ANNOTATIONS {
-    summary = "too many open file descriptors",
-    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
-  }
-
-ALERT K8STooManyOpenFiles
-  IF 100*process_open_fds / process_max_fds > 80
+ALERT TooManyOpenFileDescriptors
+  IF 100 * (process_open_fds / process_max_fds) > 95
   FOR 10m
   LABELS {
     severity = "critical"
   }
   ANNOTATIONS {
     summary = "too many open file descriptors",
-    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
   }
 
 instance:fd_utilization = process_open_fds / process_max_fds
@@ -58,7 +47,7 @@ ALERT FdExhaustionClose
   }
   ANNOTATIONS {
     summary = "file descriptors soon exhausted",
-    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
   }
 
 # alert if file descriptors are likely to exhaust within the next hour
@@ -70,40 +59,5 @@ ALERT FdExhaustionClose
   }
   ANNOTATIONS {
     summary = "file descriptors soon exhausted",
-    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon",
-  }
-
-### Contrack alerts ###
-
-# To catch the conntrack sysctl de-tuning when it happens
-ALERT ConntrackTuningMissing
-  IF node_nf_conntrack_udp_timeout > 10
-  FOR 10m
-  LABELS {
-    severity = "warning",
-  }
-  ANNOTATIONS {
-    summary = "Node does not have the correct conntrack tunings",
-    description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
-  }
-
-ALERT ConntrackTableFull
-  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
-  FOR 10m
-  LABELS {
-    severity = "warning"
-  }
-  ANNOTATIONS {
-    summary = "Number of tracked connections is near the limit",
-    description = "The nf_conntrack table is {{ $value }}% full.",
-  }
-
-ALERT ConntrackTableFull
-  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
-  LABELS {
-    severity = "critical"
-  }
-  ANNOTATIONS {
-    summary = "Number of tracked connections is near the limit",
-    description = "The nf_conntrack table is {{ $value }}% full.",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
   }
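The two FdExhaustionClose hunks above only reword the alert descriptions; the prediction itself builds on the instance:fd_utilization recording rule shown in the diff. A hedged sketch of how such a rule is typically expressed with PromQL's predict_linear — the one-hour window and four-hour horizon are assumptions, the repository's actual values may differ:

# Recording rule from the diff: current file-descriptor utilization per instance.
instance:fd_utilization = process_open_fds / process_max_fds

# Alert when a linear extrapolation of the last hour of samples predicts
# utilization crossing 100% within the next four hours (3600 * 4 seconds).
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
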
@@ -1,15 +1,3 @@
-ALERT K8SApiserverDown
-  IF up{job="apiserver"} == 0
-  FOR 15m
-  LABELS {
-    severity = "warning"
-  }
-  ANNOTATIONS {
-    summary = "API server unreachable",
-    description = "An API server could not be scraped.",
-  }
-
-# Disable for non HA kubernetes setups.
 ALERT K8SApiserverDown
   IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
   FOR 5m
@@ -18,7 +6,7 @@ ALERT K8SApiserverDown
   }
   ANNOTATIONS {
     summary = "API server unreachable",
-    description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
+    description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
   }
 
 # Some verbs excluded because they are expected to be long-lasting:
@@ -1,14 +1,3 @@
-ALERT K8SNodeDown
-  IF up{job="kubelet"} == 0
-  FOR 1h
-  LABELS {
-    severity = "warning"
-  }
-  ANNOTATIONS {
-    summary = "Kubelet cannot be scraped",
-    description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
-  }
-
 ALERT K8SNodeNotReady
   IF kube_node_status_ready{condition="true"} == 0
   FOR 1h
@@ -39,15 +28,25 @@ ALERT K8SManyNodesNotReady
   }
 
 ALERT K8SKubeletDown
-  IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
+  IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
   FOR 1h
   LABELS {
-    service = "k8s",
-    severity = "critical"
+    severity = "warning",
   }
   ANNOTATIONS {
     summary = "Many Kubelets cannot be scraped",
-    description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
+    description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
+  }
+
+ALERT K8SKubeletDown
+  IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
+  FOR 1h
+  LABELS {
+    severity = "critical",
+  }
+  ANNOTATIONS {
+    summary = "Many Kubelets cannot be scraped",
+    description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
   }
 
 ALERT K8SKubeletTooManyPods
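The hunk above splits K8SKubeletDown into a warning (more than 3% of kubelets failing scrapes) and a critical alert (more than 10% failing, or the kubelet job missing from service discovery altogether). The absent() guard matters because a ratio over an empty vector returns no samples and would never fire on its own; the thresholds below come from the diff, the comments are illustrative:

# Warning-level condition: fraction of discovered kubelets that are down.
count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03

# Critical-level condition: more than 10% down, or no kubelet targets discovered at all.
absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
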
@@ -625,6 +625,7 @@ data:
       ANNOTATIONS {
         summary = "Scheduler is down",
         description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
+        runbook = "https://github.com/coreos/tectonic-installer/blob/master/Documentation/troubleshooting/controller-recovery.md#disaster-recovery-of-scheduler-and-controller-manager-pods"
       }
   node.rules: |+
     ALERT NodeExporterDown