Merge pull request #1267 from paulfantom/runbook_urlk
jsonnet/kube-prometheus: point to runbooks.prometheus-operator.dev
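This changes only the default of each component's mixin._config.runbookURLPattern: alert annotations now link to the per-component sections of runbooks.prometheus-operator.dev instead of the kube-prometheus GitHub wiki, with %s standing in for the lowercase alert name. The pattern stays an ordinary config field, so it can still be pointed at an internal runbook site. A minimal sketch of such an override, assuming the usual kube-prometheus values layering (the import path and example.com URL are illustrative, not part of this change):

local kp = (import 'kube-prometheus/main.libsonnet') + {
  values+:: {
    alertmanager+: {
      mixin+: {
        _config+: {
          // %s is replaced with the lowercase alert name when the rules are rendered.
          runbookURLPattern: 'https://example.com/runbooks/alertmanager/%s',
        },
      },
    },
  },
};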
@@ -64,7 +64,7 @@ local defaults = {
 alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}',
 alertmanagerClusterLabels: 'namespace,service',
 alertmanagerSelector: 'job="alertmanager-' + defaults.name + '",namespace="' + defaults.namespace + '"',
-runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/alertmanager/%s',
 },
 },
 };

@@ -17,7 +17,7 @@ local defaults = {
 kubeControllerManagerSelector: 'job="kube-controller-manager"',
 kubeApiserverSelector: 'job="apiserver"',
 podLabel: 'pod',
-runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/%s',
 diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
 hostNetworkInterfaceSelector: 'device!~"veth.+"',
 },

@@ -29,7 +29,7 @@ local defaults = {
 ruleLabels: {},
 _config: {
 kubeStateMetricsSelector: 'job="' + defaults.name + '"',
-runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/%s',
 },
 },
 };

@@ -11,7 +11,7 @@ local defaults = {
 _config: {
 nodeExporterSelector: 'job="node-exporter"',
 hostNetworkInterfaceSelector: 'device!~"veth.+"',
-runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/general/%s',
 },
 },
 };

@@ -30,7 +30,7 @@ local defaults = {
 nodeExporterSelector: 'job="' + defaults.name + '"',
 fsSpaceFillingUpCriticalThreshold: 15,
 diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
-runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/node/%s',
 },
 },
 };

@@ -31,7 +31,7 @@ local defaults = {
 },
 _config: {
 prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"',
-runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s',
 },
 },
 };

@@ -36,7 +36,7 @@ local defaults = {
 prometheusSelector: 'job="prometheus-' + defaults.name + '",namespace="' + defaults.namespace + '"',
 prometheusName: '{{$labels.namespace}}/{{$labels.pod}}',
 thanosSelector: 'job="thanos-sidecar"',
-runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus/%s',
 },
 },
 thanos: {},

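The rendered PrometheusRule manifests below show the same change applied to every alert's runbook_url annotation: the %s in the pattern is filled with the lowercase alert name. Roughly, in Jsonnet terms (an illustration of the substitution, not the repository's actual helper):

std.format('https://runbooks.prometheus-operator.dev/runbooks/alertmanager/%s',
           std.asciiLower('AlertmanagerFailedReload'))
// => 'https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload'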
@@ -17,7 +17,7 @@ spec:
 - alert: AlertmanagerFailedReload
 annotations:
 description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
 summary: Reloading an Alertmanager configuration has failed.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -29,7 +29,7 @@ spec:
 - alert: AlertmanagerMembersInconsistent
 annotations:
 description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagermembersinconsistent
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
 summary: A member of an Alertmanager cluster has not found all other cluster members.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -43,7 +43,7 @@ spec:
 - alert: AlertmanagerFailedToSendAlerts
 annotations:
 description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedtosendalerts
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
 summary: An Alertmanager instance failed to send notifications.
 expr: |
 (
@@ -58,7 +58,7 @@ spec:
 - alert: AlertmanagerClusterFailedToSendAlerts
 annotations:
 description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
 summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
 expr: |
 min by (namespace,service, integration) (
@@ -73,7 +73,7 @@ spec:
 - alert: AlertmanagerClusterFailedToSendAlerts
 annotations:
 description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
 summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
 expr: |
 min by (namespace,service, integration) (
@@ -88,7 +88,7 @@ spec:
 - alert: AlertmanagerConfigInconsistent
 annotations:
 description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerconfiginconsistent
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
 summary: Alertmanager instances within the same cluster have different configurations.
 expr: |
 count by (namespace,service) (
@@ -101,7 +101,7 @@ spec:
 - alert: AlertmanagerClusterDown
 annotations:
 description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterdown
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
 summary: Half or more of the Alertmanager instances within the same cluster are down.
 expr: |
 (
@@ -120,7 +120,7 @@ spec:
 - alert: AlertmanagerClusterCrashlooping
 annotations:
 description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclustercrashlooping
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
 summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
 expr: |
 (

@@ -16,7 +16,7 @@ spec:
 - alert: TargetDown
 annotations:
 description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/targetdown
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
 summary: One or more targets are unreachable.
 expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
 for: 10m
@@ -30,7 +30,7 @@ spec:
 and always fire against a receiver. There are integrations with various notification
 mechanisms that send a notification when this alert is not firing. For example the
 "DeadMansSnitch" integration in PagerDuty.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/watchdog
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
 summary: An alert that should always be firing to certify that Alertmanager is working properly.
 expr: vector(1)
 labels:
@@ -40,7 +40,7 @@ spec:
 - alert: NodeNetworkInterfaceFlapping
 annotations:
 description: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
 summary: Network interface is often changin it's status
 expr: |
 changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2

@@ -17,7 +17,7 @@ spec:
 - alert: KubeStateMetricsListErrors
 annotations:
 description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
 summary: kube-state-metrics is experiencing errors in list operations.
 expr: |
 (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
@@ -30,7 +30,7 @@ spec:
 - alert: KubeStateMetricsWatchErrors
 annotations:
 description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
 summary: kube-state-metrics is experiencing errors in watch operations.
 expr: |
 (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
@@ -43,7 +43,7 @@ spec:
 - alert: KubeStateMetricsShardingMismatch
 annotations:
 description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricsshardingmismatch
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
 summary: kube-state-metrics sharding is misconfigured.
 expr: |
 stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
@@ -53,7 +53,7 @@ spec:
 - alert: KubeStateMetricsShardsMissing
 annotations:
 description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricsshardsmissing
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
 summary: kube-state-metrics shards are missing.
 expr: |
 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1

@@ -15,7 +15,7 @@ spec:
 - alert: KubePodCrashLooping
 annotations:
 description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
 summary: Pod is crash looping.
 expr: |
 increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) > 0
@@ -27,7 +27,7 @@ spec:
 - alert: KubePodNotReady
 annotations:
 description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |
 sum by (namespace, pod) (
@@ -43,7 +43,7 @@ spec:
 - alert: KubeDeploymentGenerationMismatch
 annotations:
 description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
 summary: Deployment generation mismatch due to possible roll-back
 expr: |
 kube_deployment_status_observed_generation{job="kube-state-metrics"}
@@ -55,7 +55,7 @@ spec:
 - alert: KubeDeploymentReplicasMismatch
 annotations:
 description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
 summary: Deployment has not matched the expected number of replicas.
 expr: |
 (
@@ -73,7 +73,7 @@ spec:
 - alert: KubeStatefulSetReplicasMismatch
 annotations:
 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
 summary: Deployment has not matched the expected number of replicas.
 expr: |
 (
@@ -91,7 +91,7 @@ spec:
 - alert: KubeStatefulSetGenerationMismatch
 annotations:
 description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
 summary: StatefulSet generation mismatch due to possible roll-back
 expr: |
 kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@@ -103,7 +103,7 @@ spec:
 - alert: KubeStatefulSetUpdateNotRolledOut
 annotations:
 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
 summary: StatefulSet update has not been rolled out.
 expr: |
 (
@@ -129,7 +129,7 @@ spec:
 - alert: KubeDaemonSetRolloutStuck
 annotations:
 description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
 summary: DaemonSet rollout is stuck.
 expr: |
 (
@@ -161,7 +161,7 @@ spec:
 - alert: KubeContainerWaiting
 annotations:
 description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontainerwaiting
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
 expr: |
 sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
@@ -171,7 +171,7 @@ spec:
 - alert: KubeDaemonSetNotScheduled
 annotations:
 description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetnotscheduled
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
 summary: DaemonSet pods are not scheduled.
 expr: |
 kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@@ -183,7 +183,7 @@ spec:
 - alert: KubeDaemonSetMisScheduled
 annotations:
 description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetmisscheduled
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
 summary: DaemonSet pods are misscheduled.
 expr: |
 kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@@ -193,7 +193,7 @@ spec:
 - alert: KubeJobCompletion
 annotations:
 description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobcompletion
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion
 summary: Job did not complete in time
 expr: |
 kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@@ -203,7 +203,7 @@ spec:
 - alert: KubeJobFailed
 annotations:
 description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobfailed
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
 summary: Job failed to complete.
 expr: |
 kube_job_failed{job="kube-state-metrics"} > 0
@@ -213,7 +213,7 @@ spec:
 - alert: KubeHpaReplicasMismatch
 annotations:
 description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
 summary: HPA has not matched descired number of replicas.
 expr: |
 (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"}
@@ -235,7 +235,7 @@ spec:
 - alert: KubeHpaMaxedOut
 annotations:
 description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
 summary: HPA is running at max replicas
 expr: |
 kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
@@ -249,7 +249,7 @@ spec:
 - alert: KubeCPUOvercommit
 annotations:
 description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuovercommit
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
 summary: Cluster has overcommitted CPU resource requests.
 expr: |
 sum(namespace_cpu:kube_pod_container_resource_requests:sum{})
@@ -263,7 +263,7 @@ spec:
 - alert: KubeMemoryOvercommit
 annotations:
 description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryovercommit
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
 summary: Cluster has overcommitted memory resource requests.
 expr: |
 sum(namespace_memory:kube_pod_container_resource_requests:sum{})
@@ -279,7 +279,7 @@ spec:
 - alert: KubeCPUQuotaOvercommit
 annotations:
 description: Cluster has overcommitted CPU resource requests for Namespaces.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuquotaovercommit
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
 summary: Cluster has overcommitted CPU resource requests.
 expr: |
 sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
@@ -292,7 +292,7 @@ spec:
 - alert: KubeMemoryQuotaOvercommit
 annotations:
 description: Cluster has overcommitted memory resource requests for Namespaces.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryquotaovercommit
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
 summary: Cluster has overcommitted memory resource requests.
 expr: |
 sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
@@ -305,7 +305,7 @@ spec:
 - alert: KubeQuotaAlmostFull
 annotations:
 description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaalmostfull
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
 summary: Namespace quota is going to be full.
 expr: |
 kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -318,7 +318,7 @@ spec:
 - alert: KubeQuotaFullyUsed
 annotations:
 description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotafullyused
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
 summary: Namespace quota is fully used.
 expr: |
 kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -331,7 +331,7 @@ spec:
 - alert: KubeQuotaExceeded
 annotations:
 description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaexceeded
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
 summary: Namespace quota has exceeded the limits.
 expr: |
 kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -344,7 +344,7 @@ spec:
 - alert: CPUThrottlingHigh
 annotations:
 description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/cputhrottlinghigh
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
 summary: Processes experience elevated CPU throttling.
 expr: |
 sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@@ -359,7 +359,7 @@ spec:
 - alert: KubePersistentVolumeFillingUp
 annotations:
 description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
 summary: PersistentVolume is filling up.
 expr: |
 (
@@ -375,7 +375,7 @@ spec:
 - alert: KubePersistentVolumeFillingUp
 annotations:
 description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
 summary: PersistentVolume is filling up.
 expr: |
 (
@@ -393,7 +393,7 @@ spec:
 - alert: KubePersistentVolumeErrors
 annotations:
 description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
 summary: PersistentVolume is having issues with provisioning.
 expr: |
 kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@@ -405,7 +405,7 @@ spec:
 - alert: KubeVersionMismatch
 annotations:
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
 expr: |
 count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
@@ -415,7 +415,7 @@ spec:
 - alert: KubeClientErrors
 annotations:
 description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
 summary: Kubernetes API server client is experiencing errors.
 expr: |
 (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@@ -430,7 +430,7 @@ spec:
 - alert: KubeAPIErrorBudgetBurn
 annotations:
 description: The API server is burning too much error budget.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
 summary: The API server is burning too much error budget.
 expr: |
 sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
@@ -444,7 +444,7 @@ spec:
 - alert: KubeAPIErrorBudgetBurn
 annotations:
 description: The API server is burning too much error budget.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
 summary: The API server is burning too much error budget.
 expr: |
 sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
@@ -458,7 +458,7 @@ spec:
 - alert: KubeAPIErrorBudgetBurn
 annotations:
 description: The API server is burning too much error budget.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
 summary: The API server is burning too much error budget.
 expr: |
 sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
@@ -472,7 +472,7 @@ spec:
 - alert: KubeAPIErrorBudgetBurn
 annotations:
 description: The API server is burning too much error budget.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
 summary: The API server is burning too much error budget.
 expr: |
 sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
@@ -488,7 +488,7 @@ spec:
 - alert: KubeClientCertificateExpiration
 annotations:
 description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
 summary: Client certificate is about to expire.
 expr: |
 apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -497,7 +497,7 @@ spec:
 - alert: KubeClientCertificateExpiration
 annotations:
 description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
 summary: Client certificate is about to expire.
 expr: |
 apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@@ -506,7 +506,7 @@ spec:
 - alert: AggregatedAPIErrors
 annotations:
 description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors
 summary: An aggregated API has reported errors.
 expr: |
 sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
@@ -515,7 +515,7 @@ spec:
 - alert: AggregatedAPIDown
 annotations:
 description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown
 summary: An aggregated API is down.
 expr: |
 (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
@@ -525,7 +525,7 @@ spec:
 - alert: KubeAPIDown
 annotations:
 description: KubeAPI has disappeared from Prometheus target discovery.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown
 summary: Target disappeared from Prometheus target discovery.
 expr: |
 absent(up{job="apiserver"} == 1)
@@ -535,7 +535,7 @@ spec:
 - alert: KubeAPITerminatedRequests
 annotations:
 description: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapiterminatedrequests
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
 summary: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
 expr: |
 sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
@@ -547,7 +547,7 @@ spec:
 - alert: KubeNodeNotReady
 annotations:
 description: '{{ $labels.node }} has been unready for more than 15 minutes.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
 summary: Node is not ready.
 expr: |
 kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@@ -557,7 +557,7 @@ spec:
 - alert: KubeNodeUnreachable
 annotations:
 description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
 summary: Node is unreachable.
 expr: |
 (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
@@ -567,7 +567,7 @@ spec:
 - alert: KubeletTooManyPods
 annotations:
 description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
 summary: Kubelet is running at capacity.
 expr: |
 count by(node) (
@@ -583,7 +583,7 @@ spec:
 - alert: KubeNodeReadinessFlapping
 annotations:
 description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
 summary: Node readiness status is flapping.
 expr: |
 sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@@ -593,7 +593,7 @@ spec:
 - alert: KubeletPlegDurationHigh
 annotations:
 description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
 summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
 expr: |
 node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@@ -603,7 +603,7 @@ spec:
 - alert: KubeletPodStartUpLatencyHigh
 annotations:
 description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
 summary: Kubelet Pod startup latency is too high.
 expr: |
 histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
@@ -613,7 +613,7 @@ spec:
 - alert: KubeletClientCertificateExpiration
 annotations:
 description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
 summary: Kubelet client certificate is about to expire.
 expr: |
 kubelet_certificate_manager_client_ttl_seconds < 604800
@@ -622,7 +622,7 @@ spec:
 - alert: KubeletClientCertificateExpiration
 annotations:
 description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
 summary: Kubelet client certificate is about to expire.
 expr: |
 kubelet_certificate_manager_client_ttl_seconds < 86400
@@ -631,7 +631,7 @@ spec:
 - alert: KubeletServerCertificateExpiration
 annotations:
 description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
 summary: Kubelet server certificate is about to expire.
 expr: |
 kubelet_certificate_manager_server_ttl_seconds < 604800
@@ -640,7 +640,7 @@ spec:
 - alert: KubeletServerCertificateExpiration
 annotations:
 description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
 summary: Kubelet server certificate is about to expire.
 expr: |
 kubelet_certificate_manager_server_ttl_seconds < 86400
@@ -649,7 +649,7 @@ spec:
 - alert: KubeletClientCertificateRenewalErrors
 annotations:
 description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
 summary: Kubelet has failed to renew its client certificate.
 expr: |
 increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
@@ -659,7 +659,7 @@ spec:
 - alert: KubeletServerCertificateRenewalErrors
 annotations:
 description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
 summary: Kubelet has failed to renew its server certificate.
 expr: |
 increase(kubelet_server_expiration_renew_errors[5m]) > 0
@@ -669,7 +669,7 @@ spec:
 - alert: KubeletDown
 annotations:
 description: Kubelet has disappeared from Prometheus target discovery.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
 summary: Target disappeared from Prometheus target discovery.
 expr: |
 absent(up{job="kubelet", metrics_path="/metrics"} == 1)
@@ -681,7 +681,7 @@ spec:
 - alert: KubeSchedulerDown
 annotations:
 description: KubeScheduler has disappeared from Prometheus target discovery.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown
 summary: Target disappeared from Prometheus target discovery.
 expr: |
 absent(up{job="kube-scheduler"} == 1)
@@ -693,7 +693,7 @@ spec:
 - alert: KubeControllerManagerDown
 annotations:
 description: KubeControllerManager has disappeared from Prometheus target discovery.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown
 summary: Target disappeared from Prometheus target discovery.
 expr: |
 absent(up{job="kube-controller-manager"} == 1)

@@ -17,7 +17,7 @@ spec:
 - alert: NodeFilesystemSpaceFillingUp
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
 summary: Filesystem is predicted to run out of space within the next 24 hours.
 expr: |
 (
@@ -33,7 +33,7 @@ spec:
 - alert: NodeFilesystemSpaceFillingUp
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
 summary: Filesystem is predicted to run out of space within the next 4 hours.
 expr: |
 (
@@ -49,7 +49,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfSpace
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
 summary: Filesystem has less than 5% space left.
 expr: |
 (
@@ -63,7 +63,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfSpace
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
 summary: Filesystem has less than 3% space left.
 expr: |
 (
@@ -77,7 +77,7 @@ spec:
 - alert: NodeFilesystemFilesFillingUp
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
 summary: Filesystem is predicted to run out of inodes within the next 24 hours.
 expr: |
 (
@@ -93,7 +93,7 @@ spec:
 - alert: NodeFilesystemFilesFillingUp
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
 summary: Filesystem is predicted to run out of inodes within the next 4 hours.
 expr: |
 (
@@ -109,7 +109,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfFiles
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
 summary: Filesystem has less than 5% inodes left.
 expr: |
 (
@@ -123,7 +123,7 @@ spec:
 - alert: NodeFilesystemAlmostOutOfFiles
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
 summary: Filesystem has less than 3% inodes left.
 expr: |
 (
@@ -137,7 +137,7 @@ spec:
 - alert: NodeNetworkReceiveErrs
 annotations:
 description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
 summary: Network interface is reporting many receive errors.
 expr: |
 rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
@@ -147,7 +147,7 @@ spec:
 - alert: NodeNetworkTransmitErrs
 annotations:
 description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
 summary: Network interface is reporting many transmit errors.
 expr: |
 rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
@@ -157,7 +157,7 @@ spec:
 - alert: NodeHighNumberConntrackEntriesUsed
 annotations:
 description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
 summary: Number of conntrack are getting close to the limit.
 expr: |
 (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
@@ -166,7 +166,7 @@ spec:
 - alert: NodeTextFileCollectorScrapeError
 annotations:
 description: Node Exporter text file collector failed to scrape.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
 summary: Node Exporter text file collector failed to scrape.
 expr: |
 node_textfile_scrape_error{job="node-exporter"} == 1
@@ -175,7 +175,7 @@ spec:
 - alert: NodeClockSkewDetected
 annotations:
 description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
 summary: Clock skew detected.
 expr: |
 (
@@ -195,7 +195,7 @@ spec:
 - alert: NodeClockNotSynchronising
 annotations:
 description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |
 min_over_time(node_timex_sync_status[5m]) == 0
@@ -207,7 +207,7 @@ spec:
 - alert: NodeRAIDDegraded
 annotations:
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
 expr: |
 node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
@@ -217,7 +217,7 @@ spec:
 - alert: NodeRAIDDiskFailure
 annotations:
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
 expr: |
 node_md_disks{state="failed"} > 0
@@ -226,7 +226,7 @@ spec:
 - alert: NodeFileDescriptorLimit
 annotations:
 description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefiledescriptorlimit
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
 summary: Kernel is predicted to exhaust file descriptors limit soon.
 expr: |
 (
@@ -238,7 +238,7 @@ spec:
 - alert: NodeFileDescriptorLimit
 annotations:
 description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
-runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefiledescriptorlimit
+runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
 summary: Kernel is predicted to exhaust file descriptors limit soon.
 expr: |
 (

@@ -17,7 +17,7 @@ spec:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
summary: Errors while performing list operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
@@ -27,7 +27,7 @@ spec:
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
@@ -37,7 +37,7 @@ spec:
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed
summary: Last controller reconciliation failed
expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
@@ -47,7 +47,7 @@ spec:
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
summary: Errors while reconciling controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
@@ -57,7 +57,7 @@ spec:
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
@@ -67,7 +67,7 @@ spec:
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
@@ -77,7 +77,7 @@ spec:
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources
summary: Resources rejected by Prometheus operator
expr: |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
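
Note on how these URLs are produced: each component's runbookURLPattern from the jsonnet defaults above is expanded per alert into the runbook_url annotation seen in these generated rules. Below is a minimal jsonnet sketch of that expansion, using a hypothetical addRunbookURL helper rather than the project's actual utility function:

local runbookURLPattern = 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s';

// Attach a runbook_url annotation built from the lower-cased alert name,
// e.g. PrometheusOperatorNotReady -> .../prometheus-operator/prometheusoperatornotready.
local addRunbookURL(rule) = rule {
  annotations+: {
    runbook_url: runbookURLPattern % std.asciiLower(rule.alert),
  },
};

addRunbookURL({
  alert: 'PrometheusOperatorNotReady',
  annotations: { summary: 'Prometheus operator not ready' },
})

Evaluating this with the jsonnet CLI yields an alert object whose runbook_url matches the new prometheus-operator entries above.
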
@@ -17,7 +17,7 @@ spec:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusbadconfig
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
summary: Failed Prometheus configuration reload.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@@ -29,7 +29,7 @@ spec:
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotificationqueuerunningfull
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
summary: Prometheus alert notification queue predicted to run full in less than 30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
@@ -45,7 +45,7 @@ spec:
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstosomealertmanagers
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
expr: |
(
@@ -61,7 +61,7 @@ spec:
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotconnectedtoalertmanagers
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
summary: Prometheus is not connected to any Alertmanagers.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@@ -73,7 +73,7 @@ spec:
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbreloadsfailing
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
@@ -83,7 +83,7 @@ spec:
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbcompactionsfailing
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
@@ -93,7 +93,7 @@ spec:
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotingestingsamples
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
summary: Prometheus is not ingesting samples.
expr: |
(
@@ -111,7 +111,7 @@ spec:
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusduplicatetimestamps
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -121,7 +121,7 @@ spec:
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoutofordertimestamps
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -131,7 +131,7 @@ spec:
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotestoragefailures
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
summary: Prometheus fails to send samples to remote storage.
expr: |
(
@@ -151,7 +151,7 @@ spec:
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritebehind
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@@ -168,7 +168,7 @@ spec:
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritedesiredshards
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@@ -184,7 +184,7 @@ spec:
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusrulefailures
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -194,7 +194,7 @@ spec:
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusmissingruleevaluations
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -204,7 +204,7 @@ spec:
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustargetlimithit
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -214,7 +214,7 @@ spec:
- alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuslabellimithit
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -224,7 +224,7 @@ spec:
- alert: PrometheusTargetSyncFailure
annotations:
description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustargetsyncfailure
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
summary: Prometheus has failed to sync targets.
expr: |
increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0
@@ -234,7 +234,7 @@ spec:
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstoanyalertmanager
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
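
Because the pattern is plain data in the component defaults, it is also the natural override point for deployments that mirror the runbooks internally. A minimal sketch, assuming kube-prometheus' values hierarchy and the mixin._config field path implied by the defaults changed in this commit (the hostname and field paths are illustrative, not taken from the repository):

local kp = (import 'kube-prometheus/main.libsonnet') + {
  values+:: {
    prometheus+: {
      mixin+: {
        _config+: {
          // Hypothetical internal mirror; any template with a single %s works.
          runbookURLPattern: 'https://runbooks.example.internal/prometheus/%s',
        },
      },
    },
  },
};

// Render the generated PrometheusRule with runbook_url pointing at the mirror
// (field name assumed to match the component's generated objects).
kp.prometheus.prometheusRule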