Merge pull request #574 from lilic/bump-prom-op-40

Bump prometheus-operator to v0.40
Lili Cosic authored 2020-06-19 11:55:50 +02:00, committed by GitHub
13 changed files with 629 additions and 587 deletions

View File

@@ -252,30 +252,32 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: {
namespace: 'monitoring',
},
grafanaDashboards+:: {
'my-dashboard.json':
dashboard.new('My Dashboard')
.addTemplate(
{
current: {
text: 'Prometheus',
value: 'Prometheus',
grafana+:: {
dashboards+:: {
'my-dashboard.json':
dashboard.new('My Dashboard')
.addTemplate(
{
current: {
text: 'Prometheus',
value: 'Prometheus',
},
hide: 0,
label: null,
name: 'datasource',
options: [],
query: 'prometheus',
refresh: 1,
regex: '',
type: 'datasource',
},
hide: 0,
label: null,
name: 'datasource',
options: [],
query: 'prometheus',
refresh: 1,
regex: '',
type: 'datasource',
},
)
.addRow(
row.new()
.addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource')
.addTarget(prometheus.target('vector(1)')))
),
)
.addRow(
row.new()
.addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource')
.addTarget(prometheus.target('vector(1)')))
),
},
},
};
@@ -298,9 +300,14 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: {
namespace: 'monitoring',
},
grafanaDashboards+:: {
grafanaDashboards+:: { // monitoring-mixin compatibility
'my-dashboard.json': (import 'example-grafana-dashboard.json'),
},
grafana+:: {
dashboards+:: { // use this method to import your dashboards to Grafana
'my-dashboard.json': (import 'example-grafana-dashboard.json'),
},
},
};
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
@@ -319,8 +326,10 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: {
namespace: 'monitoring',
},
rawGrafanaDashboards+:: {
'my-dashboard.json': (importstr 'example-grafana-dashboard.json'),
grafana+:: {
rawDashboards+:: {
'my-dashboard.json': (importstr 'example-grafana-dashboard.json'),
},
},
};
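
With this bump, Grafana dashboard configuration moves from the top-level grafanaDashboards+:: / rawGrafanaDashboards+:: fields into the grafana+:: object, as the hunks above show. A minimal consolidated sketch of the new layout (the 'grafana-' output loop and the raw-dashboard key are illustrative, not part of this diff):

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: { namespace: 'monitoring' },
  grafana+:: {
    // dashboards parsed as jsonnet/JSON values (import)
    dashboards+:: {
      'my-dashboard.json': (import 'example-grafana-dashboard.json'),
    },
    // dashboards passed through verbatim as strings (importstr)
    rawDashboards+:: {
      'my-raw-dashboard.json': (importstr 'example-grafana-dashboard.json'),
    },
  },
};

{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }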

View File

@@ -26,7 +26,7 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "release-0.39"
"version": "release-0.40"
},
{
"source": {

View File

@@ -4,7 +4,7 @@
{
"source": {
"git": {
"remote": "https://github.com/brancz/kubernetes-grafana",
"remote": "https://github.com/brancz/kubernetes-grafana.git",
"subdir": "grafana"
}
},
@@ -14,47 +14,47 @@
{
"source": {
"git": {
"remote": "https://github.com/coreos/etcd",
"remote": "https://github.com/coreos/etcd.git",
"subdir": "Documentation/etcd-mixin"
}
},
"version": "1166b1f195efae31439c7b3c913b4ef02e7df889",
"sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0="
"version": "d8c8f903eee10b8391abaef7758c38b2cd393c55",
"sum": "pk7mLpdUrHuJKkj2vhD6LGMU7P+oYYooBXAeZyZa398="
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/prometheus-operator",
"remote": "https://github.com/coreos/prometheus-operator.git",
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "d0a871b710de7b764c05ced98dbd1eb32a681790",
"sum": "cIOKRTNBUOl3a+QsaA/NjClmZAhyVJHlDFReKlXJBAs="
"version": "e31c69f9b5c6555e0f4a5c1f39d0f03182dd6b41",
"sum": "WggWVWZ+CBEUThQCztSaRELbtqdXf9s3OFzf06HbYNA="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib",
"remote": "https://github.com/grafana/grafonnet-lib.git",
"subdir": "grafonnet"
}
},
"version": "906768d46973e022594d3f03d82c5a51d86de2cc",
"sum": "J3Vp0EVbxTObr6KydLXsi4Rc0ssNVAEuwLc0NQ+4wqU="
"version": "8fb95bd89990e493a8534205ee636bfcb8db67bd",
"sum": "tDuuSKE9f4Ew2bjBM33Rs6behLEAzkmKkShSt+jpAak="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/jsonnet-libs",
"remote": "https://github.com/grafana/jsonnet-libs.git",
"subdir": "grafana-builder"
}
},
"version": "cb6bc2780a39afbbf9d4ee64fec8d1152023aee9",
"version": "881db2241f0c5007c3e831caf34b0c645202b4ab",
"sum": "slxrtftVDiTlQK22ertdfrg4Epnq97gdrLI63ftUfaE="
},
{
"source": {
"git": {
"remote": "https://github.com/ksonnet/ksonnet-lib",
"remote": "https://github.com/ksonnet/ksonnet-lib.git",
"subdir": ""
}
},
@@ -65,62 +65,62 @@
{
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git",
"subdir": ""
}
},
"version": "3cc34f995c31ed6e1e92024fed1912d63569c39f",
"sum": "r5Fg4KgiBtsFPCCHtM3Cb4CEgnizLyK97srDNAcjr+Y="
"version": "b61c5a34051f8f57284a08fe78ad8a45b430252b",
"sum": "7Hx/5eNm7ubLTsdrpk3b2+e/FLR3XOa4HCukmbRUCAY="
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git",
"subdir": "lib/promgrafonnet"
}
},
"version": "3cc34f995c31ed6e1e92024fed1912d63569c39f",
"version": "b61c5a34051f8f57284a08fe78ad8a45b430252b",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"remote": "https://github.com/kubernetes/kube-state-metrics.git",
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "52fe3a268bd78c8f32a03361e28fdf23c41512c5",
"version": "d667979ed55ad1c4db44d331b51d646f5b903aa7",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"remote": "https://github.com/kubernetes/kube-state-metrics.git",
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "52fe3a268bd78c8f32a03361e28fdf23c41512c5",
"sum": "E1GGavnf9PCWBm4WVrxWnc0FIj72UcbcweqGioWrOdU="
"version": "d667979ed55ad1c4db44d331b51d646f5b903aa7",
"sum": "o5avaguRsfFwYFNen00ZEsub1x4i8Z/ZZ2QoEjFMff8="
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus/node_exporter",
"remote": "https://github.com/prometheus/node_exporter.git",
"subdir": "docs/node-mixin"
}
},
"version": "d4d2e1db98152ab6c94dc9a12a997950e0be2416",
"sum": "ZwrC0+4o1xD6+oPBu1p+rBXLlf6pMBD9rT8ygyl2aW0="
"version": "08ce3c6dd430deb51798826701a395e460620d60",
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc="
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus/prometheus",
"remote": "https://github.com/prometheus/prometheus.git",
"subdir": "documentation/prometheus-mixin"
}
},
"version": "209d4bb8a1491f4535cc6d991681e7dc03bb1d56",
"sum": "kRb3XBTe/AALDcaTFfyuiKqzhxtLvihBkVkvJ5cUd/I=",
"version": "8d3c2f6829d73be15a6684f9324917e72fbf1a31",
"sum": "lEzhZ8gllSfAO4kmXeTwl4W0anapIeFd5GCaCNuDe18=",
"name": "prometheus"
},
{

File diff suppressed because it is too large.

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0
app.kubernetes.io/version: v0.40.0
name: prometheus-operator
namespace: monitoring
spec:
@@ -19,4 +19,4 @@ spec:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0
app.kubernetes.io/version: v0.40.0

View File

@@ -74,7 +74,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
)
@@ -95,7 +95,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
)
@@ -116,7 +116,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[2h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
)
@@ -137,7 +137,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
)
@@ -158,7 +158,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[3d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
)
@@ -179,7 +179,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[5m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
)
@@ -200,7 +200,7 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[6h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
)
@@ -326,81 +326,6 @@ spec:
labels:
verb: write
record: apiserver_request:burnrate6h
- expr: |
1 - (
(
# write too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
) +
(
# read too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
-
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
) +
# errors
sum(code:apiserver_request_total:increase30d{code=~"5.."})
)
/
sum(code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- expr: |
1 - (
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
-
(
# too slow
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."})
)
/
sum(code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
- expr: |
1 - (
(
# too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."})
)
/
sum(code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver"}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d
- expr: |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
@@ -443,6 +368,153 @@ spec:
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- interval: 3m
name: kube-apiserver-availability.rules
rules:
- expr: |
1 - (
(
# write too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
) +
(
# read too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
-
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
) +
# errors
sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- expr: |
1 - (
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
-
(
# too slow
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
- expr: |
1 - (
(
# too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d
- name: k8s.rules
rules:
- expr: |
@@ -452,31 +524,31 @@ spec:
sum by (cluster, namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info)
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
- expr: |
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info)
max by(namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_working_set_bytes
- expr: |
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info)
max by(namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_rss
- expr: |
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info)
max by(namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_cache
- expr: |
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info)
max by(namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap
- expr: |
@@ -591,12 +663,12 @@ spec:
- name: node.rules
rules:
- expr: |
sum(min(kube_pod_info) by (cluster, node))
sum(min(kube_pod_info{node!=""}) by (cluster, node))
record: ':kube_pod_info_node_count:'
- expr: |
topk by(namespace, pod) (1,
max by (node, namespace, pod) (
label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
@@ -849,13 +921,22 @@ spec:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used'
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
summary: Number of conntrack entries are getting close to the limit
summary: Number of conntrack entries are getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s.
@@ -896,20 +977,26 @@ spec:
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
for: 15m
labels:
severity: critical
severity: warning
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: |
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
sum by (namespace, pod) (
max by(namespace, pod) (
kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
@@ -922,7 +1009,7 @@ spec:
kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
@@ -940,7 +1027,7 @@ spec:
)
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
@@ -958,7 +1045,7 @@ spec:
)
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
@@ -971,27 +1058,33 @@ spec:
kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
expr: |
max without (revision) (
kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
max without (revision) (
kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
)
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
==
0
)
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
@@ -1003,7 +1096,7 @@ spec:
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeContainerWaiting
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
@@ -1254,7 +1347,9 @@ spec:
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
for: 2m
labels:
long: 1h
severity: critical
short: 5m
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
@@ -1265,7 +1360,9 @@ spec:
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
for: 15m
labels:
long: 6h
severity: critical
short: 30m
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
@@ -1276,7 +1373,9 @@ spec:
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
for: 1h
labels:
long: 1d
severity: warning
short: 2h
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
@@ -1287,7 +1386,9 @@ spec:
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
for: 3h
labels:
long: 3d
severity: warning
short: 6h
- name: kubernetes-system-apiserver
rules:
- alert: KubeAPILatencyHigh
@@ -1296,6 +1397,10 @@ spec:
{{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: |
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
>
1
and on (verb,resource)
(
cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
>
@@ -1307,10 +1412,6 @@ spec:
)
) > on (verb) group_left()
1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
and on (verb,resource)
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
>
1
for: 5m
labels:
severity: warning
@@ -1391,8 +1492,7 @@ spec:
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
expr: |
kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1
for: 2m
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1
labels:
severity: warning
- alert: KubeletTooManyPods

View File

@@ -2177,6 +2177,15 @@ spec:
of origin for each alert and metric that is user created. The label
value will always be the namespace of the object that is being created.
type: string
enforcedSampleLimit:
description: EnforcedSampleLimit defines a global limit on the number of
scraped samples that will be accepted. This overrides any SampleLimit
set per ServiceMonitor and/or PodMonitor. It is meant to be used
by admins to enforce the SampleLimit and keep the overall number of samples/series
under the desired limit. Note that if SampleLimit is lower, that
value will be taken instead.
format: int64
type: integer
evaluationInterval:
description: Interval between consecutive evaluations.
type: string
@@ -3428,6 +3437,27 @@ spec:
instance name. Defaults to the value of `prometheus`. External label
will _not_ be added when value is set to empty string (`""`).
type: string
prometheusRulesExcludedFromEnforce:
description: PrometheusRulesExcludedFromEnforce - list of Prometheus
rules to be excluded from the enforcement of adding namespace labels.
Works only if enforcedNamespaceLabel is set to true. Make sure both
ruleNamespace and ruleName are set for each pair.
items:
description: PrometheusRuleExcludeConfig enables users to configure
excluded PrometheusRule names and their namespaces to be ignored
while enforcing namespace label for alerts and metrics.
properties:
ruleName:
description: RuleName - name of excluded rule
type: string
ruleNamespace:
description: RuleNamespace - namespace of excluded rule
type: string
required:
- ruleName
- ruleNamespace
type: object
type: array
query:
description: QuerySpec defines the query command line flags when starting
Prometheus.
@@ -4114,6 +4144,10 @@ spec:
scrapeInterval:
description: Interval between consecutive scrapes.
type: string
scrapeTimeout:
description: Number of seconds to wait for target to respond before
erroring.
type: string
secrets:
description: Secrets is a list of Secrets in the same namespace as
the Prometheus object, which shall be mounted into the Prometheus
@@ -4762,6 +4796,12 @@ spec:
logLevel:
description: LogLevel for Thanos sidecar to be configured with.
type: string
minTime:
description: MinTime for Thanos sidecar to be configured with.
Option can be a constant time in RFC3339 format or time duration
relative to current time, such as -1d or 2h45m. Valid duration
units are ms, s, m, h, d, w, y.
type: string
objectStorageConfig:
description: ObjectStorageConfig configures object storage in
Thanos.
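
The Prometheus CRD hunks above introduce enforcedSampleLimit, prometheusRulesExcludedFromEnforce, scrapeTimeout, and thanos.minTime. A hedged sketch of setting them from a kube-prometheus overlay; the field names come from the CRD above, while the concrete values and the excluded rule are illustrative only:

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  prometheus+:: {
    prometheus+: {
      spec+: {
        enforcedSampleLimit: 100000,  // global cap; a lower per-monitor SampleLimit still wins
        scrapeTimeout: '10s',         // how long to wait for a target before erroring
        prometheusRulesExcludedFromEnforce: [
          // hypothetical rule exempted from namespace-label enforcement
          { ruleNamespace: 'monitoring', ruleName: 'example-rules' },
        ],
        thanos+: { minTime: '-1d' },  // only relevant when the Thanos sidecar is configured
      },
    },
  },
};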

View File

@@ -2998,6 +2998,27 @@ spec:
priorityClassName:
description: Priority class assigned to the Pods
type: string
prometheusRulesExcludedFromEnforce:
description: PrometheusRulesExcludedFromEnforce - list of Prometheus
rules to be excluded from the enforcement of adding namespace labels.
Works only if enforcedNamespaceLabel is set to true. Make sure both
ruleNamespace and ruleName are set for each pair.
items:
description: PrometheusRuleExcludeConfig enables users to configure
excluded PrometheusRule names and their namespaces to be ignored
while enforcing namespace label for alerts and metrics.
properties:
ruleName:
description: RuleName - name of excluded rule
type: string
ruleNamespace:
description: RuleNamespace - namespace of excluded rule
type: string
required:
- ruleName
- ruleNamespace
type: object
type: array
queryConfig:
description: Define configuration for connecting to thanos query instances.
If this is defined, the QueryEndpoints field will be ignored. Maps

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0
app.kubernetes.io/version: v0.40.0
name: prometheus-operator
rules:
- apiGroups:

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0
app.kubernetes.io/version: v0.40.0
name: prometheus-operator
roleRef:
apiGroup: rbac.authorization.k8s.io

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0
app.kubernetes.io/version: v0.40.0
name: prometheus-operator
namespace: monitoring
spec:
@@ -18,15 +18,15 @@ spec:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0
app.kubernetes.io/version: v0.40.0
spec:
containers:
- args:
- --kubelet-service=kube-system/kubelet
- --logtostderr=true
- --config-reloader-image=jimmidyson/configmap-reload:v0.3.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.39.0
image: quay.io/coreos/prometheus-operator:v0.39.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.40.0
image: quay.io/coreos/prometheus-operator:v0.40.0
name: prometheus-operator
ports:
- containerPort: 8080
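
Rather than editing the generated Deployment, downstream users typically pin these images from jsonnet. A sketch, assuming the _config.versions / _config.imageRepos keys that kube-prometheus conventionally exposes (they are not part of this diff):

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    versions+:: {
      prometheusOperator: 'v0.40.0',  // also drives the prometheus-config-reloader tag
    },
    imageRepos+:: {
      prometheusOperator: 'quay.io/coreos/prometheus-operator',
    },
  },
};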

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0
app.kubernetes.io/version: v0.40.0
name: prometheus-operator
namespace: monitoring
spec:

View File

@@ -4,6 +4,6 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.39.0
app.kubernetes.io/version: v0.40.0
name: prometheus-operator
namespace: monitoring