kube-prometheus: add alerting rules
This commit is contained in:
97
assets/prometheus/rules/general.rules
Normal file
97
assets/prometheus/rules/general.rules
Normal file
@@ -0,0 +1,97 @@
|
||||
### Up Alerting ###
|
||||
|
||||
# Fires when a scrape target has reported up == 0 continuously for 10 minutes.
# The description identifies the affected target by its `job` label.
ALERT TargetDown
  IF up == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "target is down",
    description = "A target of type {{ $labels.job }} is down."
  }
|
||||
|
||||
### File descriptor alerts ###
|
||||
|
||||
# Warning tier: the percentage of process_open_fds relative to process_max_fds
# has stayed above 50 for 10 minutes. {{ $value }} in the description is that
# percentage.
ALERT TooManyOpenFiles
  IF 100 * process_open_fds / process_max_fds > 50
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors."
  }
|
||||
|
||||
# Critical tier of the open-file-descriptor check: same expression as
# TooManyOpenFiles, but the percentage must stay above 80 for 10 minutes.
ALERT K8STooManyOpenFiles
  IF 100 * process_open_fds / process_max_fds > 80
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors."
  }
|
||||
|
||||
# Recording rule: fraction (0..1 scale, not a percentage) of file descriptors in use, computed as process_open_fds divided by process_max_fds. The FdExhaustionClose alerts feed this series to predict_linear.
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
# Warning tier: a linear extrapolation of the last 1h of instance:fd_utilization,
# projected 4 hours (3600 * 4 seconds) ahead, exceeds 1 -- i.e. open descriptors
# are predicted to pass the limit. The condition must hold for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance is predicted to exhaust its file descriptors within the next 4 hours"
  }
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
# Critical tier: a linear extrapolation of the last 10m of instance:fd_utilization,
# projected 1 hour (3600 seconds) ahead, exceeds 1 -- i.e. open descriptors are
# predicted to pass the limit. The condition must hold for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance is predicted to exhaust its file descriptors within the next hour"
  }
|
||||
|
||||
### Conntrack alerts ###
|
||||
|
||||
# To catch the conntrack sysctl de-tuning when it happens
|
||||
# Fires when node_nf_conntrack_udp_timeout has been above 10 for 10 minutes,
# which is treated here as the sign that the conntrack sysctl tuning was lost.
ALERT ConntrackTuningMissing
  IF node_nf_conntrack_udp_timeout > 10
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Node does not have the correct conntrack tunings",
    description = "Nodes keep un-setting the correct tunings, investigate when it happens."
  }
|
||||
|
||||
# Warning tier: node_nf_conntrack_entries as a percentage of
# node_nf_conntrack_entries_limit has stayed above 50 for 10 minutes.
# {{ $value }} in the description is that percentage.
ALERT ConntrackTableFull
  IF 100 * node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full."
  }
|
||||
|
||||
# Critical tier: node_nf_conntrack_entries as a percentage of
# node_nf_conntrack_entries_limit is above 90.
# NOTE(review): unlike the warning tier there is no FOR clause, so this fires
# as soon as an evaluation sees the condition -- confirm this is intentional.
ALERT ConntrackTableFull
  IF 100 * node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full."
  }
|
||||
Reference in New Issue
Block a user