### Up Alerting ###

# Fires when a target's `up` series has been 0 for 10 minutes.
ALERT TargetDown
  IF up == 0
  FOR 10m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "target is down",
    description = "A target of type {{ $labels.job }} is down.",
  }

### File descriptor alerts ###

# Warning tier: more than 50% of the process's file descriptor limit has been
# in use for 10 minutes. The expression is scaled by 100 so that {{ $value }}
# renders as a percentage in the description.
ALERT TooManyOpenFiles
  IF 100*process_open_fds / process_max_fds > 50
  FOR 10m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
  }

# Critical tier of the same check: more than 80% in use for 10 minutes.
ALERT K8STooManyOpenFiles
  IF 100*process_open_fds / process_max_fds > 80
  FOR 10m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
  }

# Recording rule: fraction (0..1) of the file descriptor limit currently in
# use. The FdExhaustionClose alerts below extrapolate this series.
instance:fd_utilization = process_open_fds / process_max_fds

# alert if file descriptors are likely to exhaust within the next 4 hours
# (linear extrapolation of the last 1h of data; a predicted ratio above 1
# means the limit would be exceeded)
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance will exhaust its file descriptors soon",
  }

# alert if file descriptors are likely to exhaust within the next hour
# (linear extrapolation of the last 10m of data). Shares its alert name with
# the warning rule above; the two are told apart by the severity label.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance will exhaust its file descriptors soon",
  }

### Conntrack alerts ###

# To catch the conntrack sysctl de-tuning when it happens
ALERT ConntrackTuningMissing
  IF node_nf_conntrack_udp_timeout > 10
  FOR 10m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node does not have the correct conntrack tunings",
    description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
  }

# Warning tier: the conntrack table has been more than 50% full for 10 minutes.
ALERT ConntrackTableFull
  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
  FOR 10m
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full.",
  }

# Critical tier: the conntrack table is more than 90% full.
# NOTE(review): unlike every other rule in this file there is no FOR clause,
# so this fires on the first evaluation where the expression holds --
# presumably intentional for a near-full table; confirm.
ALERT ConntrackTableFull
  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Number of tracked connections is near the limit",
    description = "The nf_conntrack table is {{ $value }}% full.",
  }