diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 54db1580..1406fa3c 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2030,6 +2030,9 @@ items: "id": 5, "lines": true, "linewidth": 1, + "links": [ + + ], "minSpan": 24, "nullPointMode": "null as zero", "renderer": "flot", @@ -19273,6 +19276,9 @@ items: "id": 5, "lines": true, "linewidth": 1, + "links": [ + + ], "minSpan": 24, "nullPointMode": "null as zero", "renderer": "flot", @@ -20644,6 +20650,9 @@ items: "id": 5, "lines": true, "linewidth": 1, + "links": [ + + ], "minSpan": 24, "nullPointMode": "null as zero", "renderer": "flot", diff --git a/manifests/prometheus-operator-serviceMonitor.yaml b/manifests/prometheus-operator-serviceMonitor.yaml index a8b977de..52ee0e31 100644 --- a/manifests/prometheus-operator-serviceMonitor.yaml +++ b/manifests/prometheus-operator-serviceMonitor.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator namespace: monitoring spec: @@ -19,4 +19,4 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 0c1eb32d..37aa9f0a 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1020,6 +1020,28 @@ spec: for: 10m labels: severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is + in degraded state due to one or more disks failures. Number of spare drives + is insufficient to fix issue automatically. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded + summary: RAID Array is degraded + expr: | + node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array on {{ $labels.instance }} failed. + Array '{{ $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure + summary: Failed device in RAID array + expr: | + node_md_disks{state="fail"} > 0 + labels: + severity: warning - name: kubernetes-apps rules: - alert: KubePodCrashLooping @@ -1301,6 +1323,20 @@ spec: for: 5m labels: severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info - alert: KubeQuotaFullyUsed annotations: message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage @@ -1310,10 +1346,24 @@ spec: kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - >= 1 + == 1 for: 15m labels: severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning - alert: CPUThrottlingHigh annotations: message: '{{ $value | humanizePercentage }} throttling of CPU in namespace diff --git a/manifests/setup/prometheus-operator-clusterRole.yaml b/manifests/setup/prometheus-operator-clusterRole.yaml index a3de43b7..ed895874 100644 --- a/manifests/setup/prometheus-operator-clusterRole.yaml +++ b/manifests/setup/prometheus-operator-clusterRole.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator rules: - apiGroups: diff --git a/manifests/setup/prometheus-operator-clusterRoleBinding.yaml b/manifests/setup/prometheus-operator-clusterRoleBinding.yaml index 9001430c..38e98265 100644 --- a/manifests/setup/prometheus-operator-clusterRoleBinding.yaml +++ b/manifests/setup/prometheus-operator-clusterRoleBinding.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/setup/prometheus-operator-deployment.yaml b/manifests/setup/prometheus-operator-deployment.yaml index 56de8d59..08cd3a77 100644 --- a/manifests/setup/prometheus-operator-deployment.yaml +++ b/manifests/setup/prometheus-operator-deployment.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator namespace: monitoring spec: @@ -18,15 +18,15 @@ spec: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 spec: containers: - args: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=jimmidyson/configmap-reload:v0.4.0 - - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.0 - image: quay.io/prometheus-operator/prometheus-operator:v0.42.0 + - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.1 + image: quay.io/prometheus-operator/prometheus-operator:v0.42.1 name: prometheus-operator ports: - containerPort: 8080 diff --git a/manifests/setup/prometheus-operator-service.yaml b/manifests/setup/prometheus-operator-service.yaml index 91d1bbe3..a1543b11 100644 --- a/manifests/setup/prometheus-operator-service.yaml +++ b/manifests/setup/prometheus-operator-service.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator namespace: monitoring spec: diff --git a/manifests/setup/prometheus-operator-serviceAccount.yaml b/manifests/setup/prometheus-operator-serviceAccount.yaml index 2a98d4db..37f53feb 100644 --- a/manifests/setup/prometheus-operator-serviceAccount.yaml +++ b/manifests/setup/prometheus-operator-serviceAccount.yaml @@ -4,6 +4,6 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator namespace: monitoring