docs, manifests: Regenerate files

This commit is contained in:
Lili Cosic
2020-06-19 10:30:50 +02:00
parent c5ecc42244
commit beaba9f4da
11 changed files with 599 additions and 557 deletions

View File

@@ -252,7 +252,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: { _config+:: {
namespace: 'monitoring', namespace: 'monitoring',
}, },
grafanaDashboards+:: { grafana+:: {
dashboards+:: {
'my-dashboard.json': 'my-dashboard.json':
dashboard.new('My Dashboard') dashboard.new('My Dashboard')
.addTemplate( .addTemplate(
@@ -277,6 +278,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
.addTarget(prometheus.target('vector(1)'))) .addTarget(prometheus.target('vector(1)')))
), ),
}, },
},
}; };
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
@@ -298,9 +300,14 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: { _config+:: {
namespace: 'monitoring', namespace: 'monitoring',
}, },
grafanaDashboards+:: { grafanaDashboards+:: { // monitoring-mixin compatibility
'my-dashboard.json': (import 'example-grafana-dashboard.json'), 'my-dashboard.json': (import 'example-grafana-dashboard.json'),
}, },
grafana+:: {
dashboards+:: { // use this method to import your dashboards to Grafana
'my-dashboard.json': (import 'example-grafana-dashboard.json'),
},
},
}; };
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
@@ -319,9 +326,11 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: { _config+:: {
namespace: 'monitoring', namespace: 'monitoring',
}, },
rawGrafanaDashboards+:: { grafana+:: {
rawDashboards+:: {
'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'),
}, },
},
}; };
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0 app.kubernetes.io/version: v0.40.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:
@@ -19,4 +19,4 @@ spec:
matchLabels: matchLabels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0 app.kubernetes.io/version: v0.40.0

View File

@@ -74,7 +74,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
- -
( (
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1d])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
) )
@@ -95,7 +95,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
- -
( (
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
) )
@@ -116,7 +116,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
- -
( (
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[2h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
) )
@@ -137,7 +137,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
- -
( (
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30m])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
) )
@@ -158,7 +158,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
- -
( (
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[3d])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
) )
@@ -179,7 +179,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
- -
( (
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[5m])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
) )
@@ -200,7 +200,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
- -
( (
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[6h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
) )
@@ -326,81 +326,6 @@ spec:
labels: labels:
verb: write verb: write
record: apiserver_request:burnrate6h record: apiserver_request:burnrate6h
- expr: |
1 - (
(
# write too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
) +
(
# read too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
-
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
) +
# errors
sum(code:apiserver_request_total:increase30d{code=~"5.."})
)
/
sum(code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- expr: |
1 - (
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
-
(
# too slow
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."})
)
/
sum(code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
- expr: |
1 - (
(
# too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."})
)
/
sum(code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver"}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d
- expr: | - expr: |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels: labels:
@@ -443,6 +368,153 @@ spec:
labels: labels:
quantile: "0.5" quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- interval: 3m
name: kube-apiserver-availability.rules
rules:
- expr: |
1 - (
(
# write too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
) +
(
# read too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
-
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
) +
# errors
sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- expr: |
1 - (
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
-
(
# too slow
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
- expr: |
1 - (
(
# too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d
- name: k8s.rules - name: k8s.rules
rules: rules:
- expr: | - expr: |
@@ -452,31 +524,31 @@ spec:
sum by (cluster, namespace, pod, container) ( sum by (cluster, namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info) 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
) )
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
- expr: | - expr: |
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info) max by(namespace, pod, node) (kube_pod_info{node!=""})
) )
record: node_namespace_pod_container:container_memory_working_set_bytes record: node_namespace_pod_container:container_memory_working_set_bytes
- expr: | - expr: |
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info) max by(namespace, pod, node) (kube_pod_info{node!=""})
) )
record: node_namespace_pod_container:container_memory_rss record: node_namespace_pod_container:container_memory_rss
- expr: | - expr: |
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info) max by(namespace, pod, node) (kube_pod_info{node!=""})
) )
record: node_namespace_pod_container:container_memory_cache record: node_namespace_pod_container:container_memory_cache
- expr: | - expr: |
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info) max by(namespace, pod, node) (kube_pod_info{node!=""})
) )
record: node_namespace_pod_container:container_memory_swap record: node_namespace_pod_container:container_memory_swap
- expr: | - expr: |
@@ -591,12 +663,12 @@ spec:
- name: node.rules - name: node.rules
rules: rules:
- expr: | - expr: |
sum(min(kube_pod_info) by (cluster, node)) sum(min(kube_pod_info{node!=""}) by (cluster, node))
record: ':kube_pod_info_node_count:' record: ':kube_pod_info_node_count:'
- expr: | - expr: |
topk by(namespace, pod) (1, topk by(namespace, pod) (1,
max by (node, namespace, pod) ( max by (node, namespace, pod) (
label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
)) ))
record: 'node_namespace_pod:kube_pod_info:' record: 'node_namespace_pod:kube_pod_info:'
- expr: | - expr: |
@@ -849,13 +921,22 @@ spec:
severity: warning severity: warning
- alert: NodeHighNumberConntrackEntriesUsed - alert: NodeHighNumberConntrackEntriesUsed
annotations: annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used' description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit summary: Number of conntrack are getting close to the limit.
expr: | expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels: labels:
severity: warning severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected - alert: NodeClockSkewDetected
annotations: annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. message: Clock on {{ $labels.instance }} is out of sync by more than 300s.
@@ -896,20 +977,26 @@ spec:
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: | expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
for: 15m for: 15m
labels: labels:
severity: critical severity: warning
- alert: KubePodNotReady - alert: KubePodNotReady
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes. state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: | expr: |
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0 sum by (namespace, pod) (
max by(namespace, pod) (
kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m for: 15m
labels: labels:
severity: critical severity: warning
- alert: KubeDeploymentGenerationMismatch - alert: KubeDeploymentGenerationMismatch
annotations: annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
@@ -922,7 +1009,7 @@ spec:
kube_deployment_metadata_generation{job="kube-state-metrics"} kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m for: 15m
labels: labels:
severity: critical severity: warning
- alert: KubeDeploymentReplicasMismatch - alert: KubeDeploymentReplicasMismatch
annotations: annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
@@ -940,7 +1027,7 @@ spec:
) )
for: 15m for: 15m
labels: labels:
severity: critical severity: warning
- alert: KubeStatefulSetReplicasMismatch - alert: KubeStatefulSetReplicasMismatch
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
@@ -958,7 +1045,7 @@ spec:
) )
for: 15m for: 15m
labels: labels:
severity: critical severity: warning
- alert: KubeStatefulSetGenerationMismatch - alert: KubeStatefulSetGenerationMismatch
annotations: annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
@@ -971,13 +1058,14 @@ spec:
kube_statefulset_metadata_generation{job="kube-state-metrics"} kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m for: 15m
labels: labels:
severity: critical severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut - alert: KubeStatefulSetUpdateNotRolledOut
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out. has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
expr: | expr: |
(
max without (revision) ( max without (revision) (
kube_statefulset_status_current_revision{job="kube-state-metrics"} kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless unless
@@ -989,9 +1077,14 @@ spec:
!= !=
kube_statefulset_status_replicas_updated{job="kube-state-metrics"} kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
) )
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
==
0
)
for: 15m for: 15m
labels: labels:
severity: critical severity: warning
- alert: KubeDaemonSetRolloutStuck - alert: KubeDaemonSetRolloutStuck
annotations: annotations:
message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
@@ -1003,7 +1096,7 @@ spec:
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00 kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
for: 15m for: 15m
labels: labels:
severity: critical severity: warning
- alert: KubeContainerWaiting - alert: KubeContainerWaiting
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
@@ -1254,7 +1347,9 @@ spec:
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
for: 2m for: 2m
labels: labels:
long: 1h
severity: critical severity: critical
short: 5m
- alert: KubeAPIErrorBudgetBurn - alert: KubeAPIErrorBudgetBurn
annotations: annotations:
message: The API server is burning too much error budget message: The API server is burning too much error budget
@@ -1265,7 +1360,9 @@ spec:
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
for: 15m for: 15m
labels: labels:
long: 6h
severity: critical severity: critical
short: 30m
- alert: KubeAPIErrorBudgetBurn - alert: KubeAPIErrorBudgetBurn
annotations: annotations:
message: The API server is burning too much error budget message: The API server is burning too much error budget
@@ -1276,7 +1373,9 @@ spec:
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
for: 1h for: 1h
labels: labels:
long: 1d
severity: warning severity: warning
short: 2h
- alert: KubeAPIErrorBudgetBurn - alert: KubeAPIErrorBudgetBurn
annotations: annotations:
message: The API server is burning too much error budget message: The API server is burning too much error budget
@@ -1287,7 +1386,9 @@ spec:
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
for: 3h for: 3h
labels: labels:
long: 3d
severity: warning severity: warning
short: 6h
- name: kubernetes-system-apiserver - name: kubernetes-system-apiserver
rules: rules:
- alert: KubeAPILatencyHigh - alert: KubeAPILatencyHigh
@@ -1296,6 +1397,10 @@ spec:
{{ $labels.verb }} {{ $labels.resource }}. {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: | expr: |
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
>
1
and on (verb,resource)
( (
cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
> >
@@ -1307,10 +1412,6 @@ spec:
) )
) > on (verb) group_left() ) > on (verb) group_left()
1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) 1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
and on (verb,resource)
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
>
1
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@@ -1391,8 +1492,7 @@ spec:
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
expr: | expr: |
kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1 (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1
for: 2m
labels: labels:
severity: warning severity: warning
- alert: KubeletTooManyPods - alert: KubeletTooManyPods

View File

@@ -2177,6 +2177,15 @@ spec:
of origin for each alert and metric that is user created. The label of origin for each alert and metric that is user created. The label
value will always be the namespace of the object that is being created. value will always be the namespace of the object that is being created.
type: string type: string
enforcedSampleLimit:
description: EnforcedSampleLimit defines a global limit on the number of
scraped samples that will be accepted. This overrides any SampleLimit
set per ServiceMonitor and/or PodMonitor. It is meant to be used
by admins to enforce the SampleLimit to keep the overall number of samples/series
under the desired limit. Note that if SampleLimit is lower, that
value will be taken instead.
format: int64
type: integer
evaluationInterval: evaluationInterval:
description: Interval between consecutive evaluations. description: Interval between consecutive evaluations.
type: string type: string
@@ -3428,6 +3437,27 @@ spec:
instance name. Defaults to the value of `prometheus`. External label instance name. Defaults to the value of `prometheus`. External label
will _not_ be added when value is set to empty string (`""`). will _not_ be added when value is set to empty string (`""`).
type: string type: string
prometheusRulesExcludedFromEnforce:
description: PrometheusRulesExcludedFromEnforce - list of Prometheus
rules to be excluded from the enforced addition of namespace labels.
Works only if enforcedNamespaceLabel is set to true. Make sure both
ruleNamespace and ruleName are set for each pair.
items:
description: PrometheusRuleExcludeConfig enables users to configure
excluded PrometheusRule names and their namespaces to be ignored
while enforcing namespace label for alerts and metrics.
properties:
ruleName:
description: RuleName - name of excluded rule
type: string
ruleNamespace:
description: RuleNamespace - namespace of excluded rule
type: string
required:
- ruleName
- ruleNamespace
type: object
type: array
query: query:
description: QuerySpec defines the query command line flags when starting description: QuerySpec defines the query command line flags when starting
Prometheus. Prometheus.
@@ -4114,6 +4144,10 @@ spec:
scrapeInterval: scrapeInterval:
description: Interval between consecutive scrapes. description: Interval between consecutive scrapes.
type: string type: string
scrapeTimeout:
description: Number of seconds to wait for target to respond before
erroring.
type: string
secrets: secrets:
description: Secrets is a list of Secrets in the same namespace as description: Secrets is a list of Secrets in the same namespace as
the Prometheus object, which shall be mounted into the Prometheus the Prometheus object, which shall be mounted into the Prometheus
@@ -4762,6 +4796,12 @@ spec:
logLevel: logLevel:
description: LogLevel for Thanos sidecar to be configured with. description: LogLevel for Thanos sidecar to be configured with.
type: string type: string
minTime:
description: MinTime for Thanos sidecar to be configured with.
Option can be a constant time in RFC3339 format or time duration
relative to current time, such as -1d or 2h45m. Valid duration
units are ms, s, m, h, d, w, y.
type: string
objectStorageConfig: objectStorageConfig:
description: ObjectStorageConfig configures object storage in description: ObjectStorageConfig configures object storage in
Thanos. Thanos.

View File

@@ -2998,6 +2998,27 @@ spec:
priorityClassName: priorityClassName:
description: Priority class assigned to the Pods description: Priority class assigned to the Pods
type: string type: string
prometheusRulesExcludedFromEnforce:
description: PrometheusRulesExcludedFromEnforce - list of Prometheus
rules to be excluded from the enforced addition of namespace labels.
Works only if enforcedNamespaceLabel is set to true. Make sure both
ruleNamespace and ruleName are set for each pair.
items:
description: PrometheusRuleExcludeConfig enables users to configure
excluded PrometheusRule names and their namespaces to be ignored
while enforcing namespace label for alerts and metrics.
properties:
ruleName:
description: RuleName - name of excluded rule
type: string
ruleNamespace:
description: RuleNamespace - namespace of excluded rule
type: string
required:
- ruleName
- ruleNamespace
type: object
type: array
queryConfig: queryConfig:
description: Define configuration for connecting to thanos query instances. description: Define configuration for connecting to thanos query instances.
If this is defined, the QueryEndpoints field will be ignored. Maps If this is defined, the QueryEndpoints field will be ignored. Maps

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0 app.kubernetes.io/version: v0.40.0
name: prometheus-operator name: prometheus-operator
rules: rules:
- apiGroups: - apiGroups:

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0 app.kubernetes.io/version: v0.40.0
name: prometheus-operator name: prometheus-operator
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0 app.kubernetes.io/version: v0.40.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:
@@ -18,15 +18,15 @@ spec:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0 app.kubernetes.io/version: v0.40.0
spec: spec:
containers: containers:
- args: - args:
- --kubelet-service=kube-system/kubelet - --kubelet-service=kube-system/kubelet
- --logtostderr=true - --logtostderr=true
- --config-reloader-image=jimmidyson/configmap-reload:v0.3.0 - --config-reloader-image=jimmidyson/configmap-reload:v0.3.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.39.0 - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.40.0
image: quay.io/coreos/prometheus-operator:v0.39.0 image: quay.io/coreos/prometheus-operator:v0.40.0
name: prometheus-operator name: prometheus-operator
ports: ports:
- containerPort: 8080 - containerPort: 8080

View File

@@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0 app.kubernetes.io/version: v0.40.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:

View File

@@ -4,6 +4,6 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0 app.kubernetes.io/version: v0.40.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring