Enable Multi Cluster alerts by default (#2099)
This commit is contained in:
@@ -11,6 +11,7 @@ local defaults = {
|
|||||||
mixin:: {
|
mixin:: {
|
||||||
ruleLabels: {},
|
ruleLabels: {},
|
||||||
_config: {
|
_config: {
|
||||||
|
showMultiCluster: true,
|
||||||
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
|
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
|
||||||
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
|
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
|
||||||
kubeStateMetricsSelector: 'job="kube-state-metrics"',
|
kubeStateMetricsSelector: 'job="kube-state-metrics"',
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
summary: 'One or more targets are unreachable.',
|
summary: 'One or more targets are unreachable.',
|
||||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.',
|
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.',
|
||||||
},
|
},
|
||||||
expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10',
|
expr: '100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10',
|
||||||
'for': '10m',
|
'for': '10m',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ local defaults = {
|
|||||||
prometheus: defaults.name,
|
prometheus: defaults.name,
|
||||||
},
|
},
|
||||||
_config: {
|
_config: {
|
||||||
|
groupLabels: 'cluster,controller,namespace',
|
||||||
prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"',
|
prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"',
|
||||||
runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s',
|
runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s',
|
||||||
},
|
},
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -83,6 +83,9 @@ spec:
|
|||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
|
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
readOnly: false
|
readOnly: false
|
||||||
|
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-multicluster
|
||||||
|
name: grafana-dashboard-k8s-resources-multicluster
|
||||||
|
readOnly: false
|
||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-namespace
|
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-namespace
|
||||||
name: grafana-dashboard-k8s-resources-namespace
|
name: grafana-dashboard-k8s-resources-namespace
|
||||||
readOnly: false
|
readOnly: false
|
||||||
@@ -180,6 +183,9 @@ spec:
|
|||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
|
- configMap:
|
||||||
|
name: grafana-dashboard-k8s-resources-multicluster
|
||||||
|
name: grafana-dashboard-k8s-resources-multicluster
|
||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-k8s-resources-namespace
|
name: grafana-dashboard-k8s-resources-namespace
|
||||||
name: grafana-dashboard-k8s-resources-namespace
|
name: grafana-dashboard-k8s-resources-namespace
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ spec:
|
|||||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
|
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||||
summary: One or more targets are unreachable.
|
summary: One or more targets are unreachable.
|
||||||
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
|
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|||||||
@@ -247,50 +247,50 @@ spec:
|
|||||||
rules:
|
rules:
|
||||||
- alert: KubeCPUOvercommit
|
- alert: KubeCPUOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
summary: Cluster has overcommitted CPU resource requests.
|
||||||
expr: |
|
expr: |
|
||||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0
|
sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||||
and
|
and
|
||||||
(sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0
|
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeMemoryOvercommit
|
- alert: KubeMemoryOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
|
description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
||||||
summary: Cluster has overcommitted memory resource requests.
|
summary: Cluster has overcommitted memory resource requests.
|
||||||
expr: |
|
expr: |
|
||||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0
|
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||||
and
|
and
|
||||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0
|
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeCPUQuotaOvercommit
|
- alert: KubeCPUQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster has overcommitted CPU resource requests for Namespaces.
|
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
summary: Cluster has overcommitted CPU resource requests.
|
||||||
expr: |
|
expr: |
|
||||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"}))
|
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
|
||||||
/
|
/
|
||||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
|
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
|
||||||
> 1.5
|
> 1.5
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeMemoryQuotaOvercommit
|
- alert: KubeMemoryQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster has overcommitted memory resource requests for Namespaces.
|
description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
||||||
summary: Cluster has overcommitted memory resource requests.
|
summary: Cluster has overcommitted memory resource requests.
|
||||||
expr: |
|
expr: |
|
||||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"}))
|
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
|
||||||
/
|
/
|
||||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})
|
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
|
||||||
> 1.5
|
> 1.5
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ spec:
|
|||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
|
||||||
summary: Errors while performing list operations in controller.
|
summary: Errors while performing list operations in controller.
|
||||||
expr: |
|
expr: |
|
||||||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
|
(sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -30,7 +30,7 @@ spec:
|
|||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
|
||||||
summary: Errors while performing watch operations in controller.
|
summary: Errors while performing watch operations in controller.
|
||||||
expr: |
|
expr: |
|
||||||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.4
|
(sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.4
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -50,7 +50,7 @@ spec:
|
|||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
|
||||||
summary: Errors while reconciling controller.
|
summary: Errors while reconciling controller.
|
||||||
expr: |
|
expr: |
|
||||||
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
|
(sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -70,7 +70,7 @@ spec:
|
|||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
|
||||||
summary: Prometheus operator not ready
|
summary: Prometheus operator not ready
|
||||||
expr: |
|
expr: |
|
||||||
min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
|
min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|||||||
Reference in New Issue
Block a user