Merge pull request #133 from metalmatze/mixin-master
Use kubernetes-mixin's master in kube-prometheus master
This commit is contained in:
@@ -18,7 +18,7 @@
|
|||||||
"subdir": ""
|
"subdir": ""
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "release-0.1"
|
"version": "master"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafana",
|
"name": "grafana",
|
||||||
|
@@ -8,7 +8,7 @@
|
|||||||
"subdir": "jsonnet/kube-prometheus"
|
"subdir": "jsonnet/kube-prometheus"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "bdf84bf1865e66d76c027eb2cea7bf776acd18a4"
|
"version": "90b8632fb37be04ff73542218d980ba54c53295b"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "ksonnet",
|
"name": "ksonnet",
|
||||||
@@ -28,7 +28,7 @@
|
|||||||
"subdir": ""
|
"subdir": ""
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "af494738e1709998696ffbce9296063a20c80692"
|
"version": "d8f135ba007b4ec7ac58be9371042d19e1ae4dea"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafonnet",
|
"name": "grafonnet",
|
||||||
@@ -78,7 +78,7 @@
|
|||||||
"subdir": "Documentation/etcd-mixin"
|
"subdir": "Documentation/etcd-mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "9ff762857712c9c5c2037e6ddd9a692a488224bf"
|
"version": "948e276ca73d3eb09391829d8ac317dbda8c07a1"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -42,6 +42,12 @@ spec:
|
|||||||
- mountPath: /etc/grafana/provisioning/dashboards
|
- mountPath: /etc/grafana/provisioning/dashboards
|
||||||
name: grafana-dashboards
|
name: grafana-dashboards
|
||||||
readOnly: false
|
readOnly: false
|
||||||
|
- mountPath: /grafana-dashboard-definitions/0/apiserver
|
||||||
|
name: grafana-dashboard-apiserver
|
||||||
|
readOnly: false
|
||||||
|
- mountPath: /grafana-dashboard-definitions/0/controller-manager
|
||||||
|
name: grafana-dashboard-controller-manager
|
||||||
|
readOnly: false
|
||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-cluster-rsrc-use
|
- mountPath: /grafana-dashboard-definitions/0/k8s-cluster-rsrc-use
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
name: grafana-dashboard-k8s-cluster-rsrc-use
|
||||||
readOnly: false
|
readOnly: false
|
||||||
@@ -63,6 +69,9 @@ spec:
|
|||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-workloads-namespace
|
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-workloads-namespace
|
||||||
name: grafana-dashboard-k8s-resources-workloads-namespace
|
name: grafana-dashboard-k8s-resources-workloads-namespace
|
||||||
readOnly: false
|
readOnly: false
|
||||||
|
- mountPath: /grafana-dashboard-definitions/0/kubelet
|
||||||
|
name: grafana-dashboard-kubelet
|
||||||
|
readOnly: false
|
||||||
- mountPath: /grafana-dashboard-definitions/0/nodes
|
- mountPath: /grafana-dashboard-definitions/0/nodes
|
||||||
name: grafana-dashboard-nodes
|
name: grafana-dashboard-nodes
|
||||||
readOnly: false
|
readOnly: false
|
||||||
@@ -72,6 +81,12 @@ spec:
|
|||||||
- mountPath: /grafana-dashboard-definitions/0/pods
|
- mountPath: /grafana-dashboard-definitions/0/pods
|
||||||
name: grafana-dashboard-pods
|
name: grafana-dashboard-pods
|
||||||
readOnly: false
|
readOnly: false
|
||||||
|
- mountPath: /grafana-dashboard-definitions/0/proxy
|
||||||
|
name: grafana-dashboard-proxy
|
||||||
|
readOnly: false
|
||||||
|
- mountPath: /grafana-dashboard-definitions/0/scheduler
|
||||||
|
name: grafana-dashboard-scheduler
|
||||||
|
readOnly: false
|
||||||
- mountPath: /grafana-dashboard-definitions/0/statefulset
|
- mountPath: /grafana-dashboard-definitions/0/statefulset
|
||||||
name: grafana-dashboard-statefulset
|
name: grafana-dashboard-statefulset
|
||||||
readOnly: false
|
readOnly: false
|
||||||
@@ -90,6 +105,12 @@ spec:
|
|||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboards
|
name: grafana-dashboards
|
||||||
name: grafana-dashboards
|
name: grafana-dashboards
|
||||||
|
- configMap:
|
||||||
|
name: grafana-dashboard-apiserver
|
||||||
|
name: grafana-dashboard-apiserver
|
||||||
|
- configMap:
|
||||||
|
name: grafana-dashboard-controller-manager
|
||||||
|
name: grafana-dashboard-controller-manager
|
||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
name: grafana-dashboard-k8s-cluster-rsrc-use
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
name: grafana-dashboard-k8s-cluster-rsrc-use
|
||||||
@@ -111,6 +132,9 @@ spec:
|
|||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-k8s-resources-workloads-namespace
|
name: grafana-dashboard-k8s-resources-workloads-namespace
|
||||||
name: grafana-dashboard-k8s-resources-workloads-namespace
|
name: grafana-dashboard-k8s-resources-workloads-namespace
|
||||||
|
- configMap:
|
||||||
|
name: grafana-dashboard-kubelet
|
||||||
|
name: grafana-dashboard-kubelet
|
||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-nodes
|
name: grafana-dashboard-nodes
|
||||||
name: grafana-dashboard-nodes
|
name: grafana-dashboard-nodes
|
||||||
@@ -120,6 +144,12 @@ spec:
|
|||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-pods
|
name: grafana-dashboard-pods
|
||||||
name: grafana-dashboard-pods
|
name: grafana-dashboard-pods
|
||||||
|
- configMap:
|
||||||
|
name: grafana-dashboard-proxy
|
||||||
|
name: grafana-dashboard-proxy
|
||||||
|
- configMap:
|
||||||
|
name: grafana-dashboard-scheduler
|
||||||
|
name: grafana-dashboard-scheduler
|
||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-statefulset
|
name: grafana-dashboard-statefulset
|
||||||
name: grafana-dashboard-statefulset
|
name: grafana-dashboard-statefulset
|
||||||
|
@@ -11,30 +11,44 @@ spec:
|
|||||||
- name: k8s.rules
|
- name: k8s.rules
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: |
|
||||||
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
|
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace)
|
||||||
record: namespace:container_cpu_usage_seconds_total:sum_rate
|
record: namespace:container_cpu_usage_seconds_total:sum_rate
|
||||||
- expr: |
|
- expr: |
|
||||||
sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
|
sum by (namespace, pod, container) (
|
||||||
|
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])
|
||||||
|
)
|
||||||
|
record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
|
||||||
|
- expr: |
|
||||||
|
sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
|
||||||
record: namespace:container_memory_usage_bytes:sum
|
record: namespace:container_memory_usage_bytes:sum
|
||||||
- expr: |
|
- expr: |
|
||||||
sum by (namespace, pod_name, container_name) (
|
sum by (namespace, label_name) (
|
||||||
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
|
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace, pod)
|
||||||
|
* on (namespace, pod)
|
||||||
|
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
|
||||||
)
|
)
|
||||||
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
|
record: namespace:container_cpu_usage_seconds_total:sum_rate
|
||||||
- expr: |
|
- expr: |
|
||||||
sum by(namespace) (
|
sum by (namespace, label_name) (
|
||||||
kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
|
sum(container_memory_usage_bytes{job="kubelet",image!="", container!="POD"}) by (pod, namespace)
|
||||||
* on (endpoint, instance, job, namespace, pod, service)
|
* on (namespace, pod)
|
||||||
group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
|
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
|
||||||
)
|
)
|
||||||
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
|
record: namespace:container_memory_usage_bytes:sum
|
||||||
- expr: |
|
- expr: |
|
||||||
sum by (namespace) (
|
sum by (namespace, label_name) (
|
||||||
kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
|
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
|
||||||
* on (endpoint, instance, job, namespace, pod, service)
|
* on (namespace, pod)
|
||||||
group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
|
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
|
||||||
)
|
)
|
||||||
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
|
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
|
||||||
|
- expr: |
|
||||||
|
sum by (namespace, label_name) (
|
||||||
|
sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
|
||||||
|
* on (namespace, pod)
|
||||||
|
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
|
||||||
|
)
|
||||||
|
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
|
||||||
- expr: |
|
- expr: |
|
||||||
sum(
|
sum(
|
||||||
label_replace(
|
label_replace(
|
||||||
@@ -71,67 +85,67 @@ spec:
|
|||||||
- name: kube-scheduler.rules
|
- name: kube-scheduler.rules
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||||
- name: kube-apiserver.rules
|
- name: kube-apiserver.rules
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||||
- expr: |
|
- expr: |
|
||||||
histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
|
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||||
- name: node.rules
|
- name: node.rules
|
||||||
rules:
|
rules:
|
||||||
- expr: sum(min(kube_pod_info) by (node))
|
- expr: sum(min(kube_pod_info) by (node))
|
||||||
@@ -593,11 +607,11 @@ spec:
|
|||||||
tolerate node failure.
|
tolerate node failure.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
||||||
expr: |
|
expr: |
|
||||||
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
|
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
|
||||||
/
|
/
|
||||||
sum(node:node_num_cpu:sum)
|
sum(kube_node_status_allocatable_cpu_cores)
|
||||||
>
|
>
|
||||||
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
|
(count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -607,13 +621,13 @@ spec:
|
|||||||
tolerate node failure.
|
tolerate node failure.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
||||||
expr: |
|
expr: |
|
||||||
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
|
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
|
||||||
/
|
/
|
||||||
sum(node_memory_MemTotal_bytes)
|
sum(kube_node_status_allocatable_memory_bytes)
|
||||||
>
|
>
|
||||||
(count(node:node_num_cpu:sum)-1)
|
(count(kube_node_status_allocatable_memory_bytes)-1)
|
||||||
/
|
/
|
||||||
count(node:node_num_cpu:sum)
|
count(kube_node_status_allocatable_memory_bytes)
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -624,7 +638,7 @@ spec:
|
|||||||
expr: |
|
expr: |
|
||||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
||||||
/
|
/
|
||||||
sum(node:node_num_cpu:sum)
|
sum(kube_node_status_allocatable_cpu_cores)
|
||||||
> 1.5
|
> 1.5
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
@@ -636,7 +650,7 @@ spec:
|
|||||||
expr: |
|
expr: |
|
||||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
||||||
/
|
/
|
||||||
sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"})
|
||||||
> 1.5
|
> 1.5
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
@@ -657,12 +671,11 @@ spec:
|
|||||||
- alert: CPUThrottlingHigh
|
- alert: CPUThrottlingHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace
|
message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace
|
||||||
}} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
|
}} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
|
||||||
}}.'
|
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
||||||
expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\",
|
expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\",
|
||||||
}[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
|
}[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
|
||||||
by (container_name, pod_name, namespace)\n > 25 \n"
|
by (container, pod, namespace)\n > 25 \n"
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -769,7 +782,7 @@ spec:
|
|||||||
for {{ $labels.verb }} {{ $labels.resource }}.
|
for {{ $labels.verb }} {{ $labels.resource }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||||
expr: |
|
expr: |
|
||||||
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
|
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -779,7 +792,7 @@ spec:
|
|||||||
for {{ $labels.verb }} {{ $labels.resource }}.
|
for {{ $labels.verb }} {{ $labels.resource }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||||
expr: |
|
expr: |
|
||||||
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
|
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@@ -788,9 +801,9 @@ spec:
|
|||||||
message: API server is returning errors for {{ $value }}% of requests.
|
message: API server is returning errors for {{ $value }}% of requests.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
|
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
|
||||||
/
|
/
|
||||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
|
sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@@ -799,9 +812,9 @@ spec:
|
|||||||
message: API server is returning errors for {{ $value }}% of requests.
|
message: API server is returning errors for {{ $value }}% of requests.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
|
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
|
||||||
/
|
/
|
||||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
|
sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@@ -811,9 +824,9 @@ spec:
|
|||||||
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
||||||
/
|
/
|
||||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
|
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@@ -823,9 +836,9 @@ spec:
|
|||||||
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
||||||
/
|
/
|
||||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
|
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
Reference in New Issue
Block a user