@@ -16657,7 +16657,7 @@ items:
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
"expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
@@ -16741,7 +16741,7 @@ items:
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
"expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
@@ -26924,7 +26924,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n",
|
||||
"expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} != 0)\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
|
||||
@@ -27017,7 +27017,7 @@ items:
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n",
|
||||
"expr": "clamp_min(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n, 0)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
|
||||
|
||||
@@ -13,7 +13,7 @@ spec:
|
||||
template:
|
||||
metadata:
|
||||
annotations:
|
||||
checksum/grafana-dashboards: b44634653e3bb90dacd5c15f42200fae
|
||||
checksum/grafana-dashboards: ce13f0b50d04c73fb01da858eb1fb608
|
||||
checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a
|
||||
labels:
|
||||
app: grafana
|
||||
|
||||
@@ -4,7 +4,7 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: controller
|
||||
app.kubernetes.io/name: prometheus-operator
|
||||
app.kubernetes.io/version: v0.44.0
|
||||
app.kubernetes.io/version: v0.44.1
|
||||
name: prometheus-operator
|
||||
namespace: monitoring
|
||||
spec:
|
||||
@@ -19,4 +19,4 @@ spec:
|
||||
matchLabels:
|
||||
app.kubernetes.io/component: controller
|
||||
app.kubernetes.io/name: prometheus-operator
|
||||
app.kubernetes.io/version: v0.44.0
|
||||
app.kubernetes.io/version: v0.44.1
|
||||
|
||||
@@ -1902,21 +1902,6 @@ spec:
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
annotations:
|
||||
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
|
||||
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||
expr: |
|
||||
min without(alertmanager) (
|
||||
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PrometheusNotConnectedToAlertmanagers
|
||||
annotations:
|
||||
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
|
||||
@@ -1951,7 +1936,15 @@ spec:
|
||||
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
|
||||
summary: Prometheus is not ingesting samples.
|
||||
expr: |
|
||||
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
|
||||
(
|
||||
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
|
||||
and
|
||||
(
|
||||
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0
|
||||
or
|
||||
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0
|
||||
)
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -2001,7 +1994,7 @@ spec:
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||
- on(job, instance) group_right
|
||||
- ignoring(remote_name, url) group_right
|
||||
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||
)
|
||||
> 120
|
||||
@@ -2050,6 +2043,21 @@ spec:
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
annotations:
|
||||
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
|
||||
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||
expr: |
|
||||
min without (alertmanager) (
|
||||
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: general.rules
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
|
||||
@@ -4,7 +4,7 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: controller
|
||||
app.kubernetes.io/name: prometheus-operator
|
||||
app.kubernetes.io/version: v0.44.0
|
||||
app.kubernetes.io/version: v0.44.1
|
||||
name: prometheus-operator
|
||||
rules:
|
||||
- apiGroups:
|
||||
|
||||
@@ -4,7 +4,7 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: controller
|
||||
app.kubernetes.io/name: prometheus-operator
|
||||
app.kubernetes.io/version: v0.44.0
|
||||
app.kubernetes.io/version: v0.44.1
|
||||
name: prometheus-operator
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
|
||||
@@ -4,7 +4,7 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: controller
|
||||
app.kubernetes.io/name: prometheus-operator
|
||||
app.kubernetes.io/version: v0.44.0
|
||||
app.kubernetes.io/version: v0.44.1
|
||||
name: prometheus-operator
|
||||
namespace: monitoring
|
||||
spec:
|
||||
@@ -18,13 +18,13 @@ spec:
|
||||
labels:
|
||||
app.kubernetes.io/component: controller
|
||||
app.kubernetes.io/name: prometheus-operator
|
||||
app.kubernetes.io/version: v0.44.0
|
||||
app.kubernetes.io/version: v0.44.1
|
||||
spec:
|
||||
containers:
|
||||
- args:
|
||||
- --kubelet-service=kube-system/kubelet
|
||||
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.44.0
|
||||
image: quay.io/prometheus-operator/prometheus-operator:v0.44.0
|
||||
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.44.1
|
||||
image: quay.io/prometheus-operator/prometheus-operator:v0.44.1
|
||||
name: prometheus-operator
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
|
||||
@@ -4,7 +4,7 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: controller
|
||||
app.kubernetes.io/name: prometheus-operator
|
||||
app.kubernetes.io/version: v0.44.0
|
||||
app.kubernetes.io/version: v0.44.1
|
||||
name: prometheus-operator
|
||||
namespace: monitoring
|
||||
spec:
|
||||
|
||||
@@ -4,6 +4,6 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: controller
|
||||
app.kubernetes.io/name: prometheus-operator
|
||||
app.kubernetes.io/version: v0.44.0
|
||||
app.kubernetes.io/version: v0.44.1
|
||||
name: prometheus-operator
|
||||
namespace: monitoring
|
||||
|
||||
Reference in New Issue
Block a user