Compare commits


24 Commits

Author SHA1 Message Date
Frederic Branczyk
a95e9dada4 Merge pull request #972 from zzzpoint/release-0.3-fix
Use etcd release-3.4 instead of master to fix broken dependency
2021-02-23 17:35:04 +01:00
Kriuchkov
6fa8bfae2e Use etcd release-3.4 instead of master to fix broken dependency 2021-02-23 10:47:18 -05:00
redwarn
53f18a4276 modify kube-prometheus-static-etcd.libsonnet servicemonitorEtcd namespace (#592)
* Change the servicemonitorEtcd namespace from kube-system to $._config.namespace

* delete  Watchdog of alert rule

* Revert "delete  Watchdog of alert rule"

This reverts commit 815b922ead.

Co-authored-by: tyger <tyger.cheng@oriente.com>
2020-06-30 09:15:19 +02:00
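
A minimal usage sketch of what this change enables, assuming the standard release-0.3 entry-point imports; the namespace override is the part this commit affects, the etcd IP is a placeholder:

local kp =
  (import 'kube-prometheus/kube-prometheus.libsonnet') +
  (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + {
    _config+:: {
      namespace: 'monitoring',  // the etcd ServiceMonitor now lands here instead of kube-system
      etcd+:: {
        ips: ['192.168.1.10'],  // placeholder etcd endpoint
      },
    },
  };

{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) }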
Frederic Branczyk
058439f108 Merge pull request #581 from simonpasquier/backport-576-to-release-0.3
Backport #576 to release 0.3
2020-06-23 16:10:57 +02:00
Simon Pasquier
e6529d950f manifests: regenerate 2020-06-22 17:17:10 +02:00
Simon Pasquier
ddd3fd7fe5 Fix AlertmanagerConfigInconsistent alert
Previously the alert would fire when the number of Alertmanager pods
didn't match the number of replicas defined in the Alertmanager spec
even though all the running pods had the same configuration hash. This
type of issue is already covered by KubeStatefulSetUpdateNotRolledOut
(and possibly KubePodNotReady), having AlertmanagerConfigInconsistent
also active in this situation creates unnecessary noise.

With this change, the alert expression only returns results when Alertmanager
pods have different configuration hash values, irrespective of the number
of pod replicas. The message annotation has also been enhanced to report
the configuration hash for each pod.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
2020-06-22 16:59:24 +02:00
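
In effect, the new expression just counts distinct configuration hashes per Alertmanager service: the inner count_values produces one series per distinct alertmanager_config_hash value, and the outer count makes the alert fire as soon as more than one distinct hash exists for a namespace/service pair. The same jsonnet fragment from the diff below, reformatted across lines for readability:

expr: |||
  count by(namespace,service) (
    count_values by(namespace,service) (
      "config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}
    )
  ) != 1
||| % $._config,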
Lili Cosic
ca9bf08a8a Merge pull request #549 from dgrisonnet/pin-kubernetes-mixin-0.3
Pin kubernetes-mixin version in release-0.3
2020-05-27 09:58:22 +02:00
Damien Grisonnet
4dd632afc0 jsonnet: pin kubernetes-mixin version
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-05-26 18:21:38 +02:00
Frederic Branczyk
e964410209 Merge pull request #513 from omerlh/cherry-pick
Allow to configure EKS available IPs alert
2020-04-27 15:49:25 +02:00
Omer Levi Hevroni
326eecf9af Allow to configure EKS available IPs alert 2020-04-27 08:41:30 +03:00
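
The alert's threshold and duration now come from _config.eks, so they can be overridden from an entry point instead of editing the rule. A minimal sketch, assuming the add-on path kube-prometheus/kube-prometheus-eks.libsonnet and illustrative values:

(import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-eks.libsonnet') + {
  _config+:: {
    eks+: {
      minimumAvailableIPs: 15,        // alert when fewer than 15 IPs remain...
      minimumAvailableIPsTime: '15m', // ...for at least 15 minutes
    },
  },
}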
Lili Cosic
a98d4be60e Merge pull request #475 from dgrisonnet/ci-test-compat
ci: update release-0.3 e2e tests according to compat matrix
2020-04-01 19:01:58 +02:00
Damien Grisonnet
3101c620c0 test: increase pod polling time
The original polling time was a bit short for all pods to be up, which made
e2e tests fail half of the time.

Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-03-31 18:24:10 +02:00
Damien Grisonnet
f3a47b9bba ci: update e2e tests according to compat matrix
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-03-31 15:47:57 +02:00
Lili Cosic
059149d37c Merge pull request #445 from dgrisonnet/backport-podmonitor
Backport podmonitor to release-0.3
2020-03-10 15:43:12 +01:00
Jonathan Amiez
1ade732468 Enable PodMonitors discovery across namespaces 2020-03-10 15:22:16 +01:00
Jonathan Amiez
f62ba1e136 Update generated manifests 2020-03-10 15:22:03 +01:00
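
This backport sets podMonitorNamespaceSelector: {} on the generated Prometheus object, so PodMonitors are discovered in all namespaces by default. If that is too broad, the selector can be narrowed again from a user's own jsonnet entry point; a hedged sketch using an illustrative namespace label:

(import 'kube-prometheus/kube-prometheus.libsonnet') + {
  prometheus+:: {
    prometheus+: {
      spec+: {
        // only discover PodMonitors in namespaces labelled monitoring=enabled (illustrative label)
        podMonitorNamespaceSelector: {
          matchLabels: { monitoring: 'enabled' },
        },
      },
    },
  },
}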
Sergiusz Urbaniak
ed71719c8e Merge pull request #419 from s-urbaniak/count-0.3
[backport] jsonnet: add general rules for up/down targets
2020-02-20 10:00:50 +01:00
Sergiusz Urbaniak
8fd8248928 Makefile: pin jsonnet-ci to 0.36 2020-02-20 09:04:19 +01:00
Sergiusz Urbaniak
d1b81cc1ac manifests: regenerate 2020-02-19 09:15:04 +01:00
Sergiusz Urbaniak
6a19c05248 jsonnet: add general rules for up/down targets 2020-02-19 09:13:23 +01:00
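
The two recording rules strip the instance, pod and node labels and count targets that are up (count:up1) and down (count:up0) per remaining label set, typically per job. A hypothetical follow-on rule, written in the same style, shows how they can be combined:

{
  prometheusRules+:: {
    groups+: [
      {
        name: 'kube-prometheus-general-derived.rules',  // illustrative group, not part of this backport
        rules: [
          {
            // fraction of targets currently up, e.g. per job
            expr: 'count:up1 / (count:up1 + count:up0)',
            record: 'count:up:ratio',  // hypothetical record name
          },
        ],
      },
    ],
  },
}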
Sergiusz Urbaniak
989c6813aa Merge pull request #366 from paulfantom/backport_ipv6
Backport ipv6 compatibility (#326)
2020-01-08 15:21:52 +01:00
paulfantom
5e5d1297f4 manifests: regenerate 2020-01-08 14:45:55 +01:00
paulfantom
7a94c41e08 jsonnet/kube-prometheus/node-exporter: fix typo 2020-01-08 14:43:39 +01:00
paulfantom
6b3cb71ab2 jsonnet/kube-prometheus/node-exporter: wrap pod ip address in square brackets for ipv6 compatibility reasons 2020-01-08 14:30:18 +01:00
16 changed files with 118 additions and 43 deletions

View File

@@ -15,7 +15,7 @@ CONTAINER_CMD:=docker run --rm \
 	-v "$(shell go env GOCACHE):/.cache/go-build" \
 	-v "$(PWD):/go/src/github.com/coreos/kube-prometheus:Z" \
 	-w "/go/src/github.com/coreos/kube-prometheus" \
-	quay.io/coreos/jsonnet-ci
+	quay.io/coreos/jsonnet-ci:release-0.36
 
 all: generate fmt test

View File

@@ -7,10 +7,15 @@
     {
       alert: 'AlertmanagerConfigInconsistent',
       annotations: {
-        message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.',
+        message: |||
+          The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
+          {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
+          Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
+          {{ end }}
+        |||,
       },
       expr: |||
-        count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
+        count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
       ||| % $._config,
       'for': '5m',
       labels: {

View File

@@ -18,7 +18,7 @@
           "subdir": ""
         }
       },
-      "version": "master"
+      "version": "release-0.2"
     },
     {
       "name": "grafana",
@@ -48,7 +48,7 @@
           "subdir": "Documentation/etcd-mixin"
         }
       },
-      "version": "master"
+      "version": "release-3.4"
     },
     {
       "name": "prometheus",

View File

@@ -3,6 +3,12 @@ local service = k.core.v1.service;
 local servicePort = k.core.v1.service.mixin.spec.portsType;
 
 {
+  _config+:: {
+    eks: {
+      minimumAvailableIPs: 10,
+      minimumAvailableIPsTime: '10m'
+    }
+  },
   prometheus+: {
     AwsEksCniMetricService:
       service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
@@ -48,14 +54,14 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
           name: 'kube-prometheus-eks.rules',
           rules: [
             {
-              expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
+              expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs,
               labels: {
                 severity: 'critical',
               },
               annotations: {
                 message: 'Instance {{ $labels.instance }} has less than 10 IPs available.'
               },
-              'for': '10m',
+              'for': $._config.eks.minimumAvailableIPsTime,
               alert: 'EksAvailableIPs'
             },
           ],

View File

@@ -50,7 +50,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
       kind: 'ServiceMonitor',
       metadata: {
         name: 'etcd',
-        namespace: 'kube-system',
+        namespace: $._config.namespace,
         labels: {
           'k8s-app': 'etcd',
         },

View File

@@ -105,7 +105,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
         container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) +
         container.withArgs([
           '--logtostderr',
-          '--secure-listen-address=$(IP):' + $._config.nodeExporter.port,
+          '--secure-listen-address=[$(IP)]:' + $._config.nodeExporter.port,
           '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites),
           '--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/',
         ]) +

View File

@@ -182,6 +182,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
         serviceMonitorSelector: {},
         podMonitorSelector: {},
         serviceMonitorNamespaceSelector: {},
+        podMonitorNamespaceSelector: {},
         nodeSelector: { 'kubernetes.io/os': 'linux' },
         ruleSelector: selector.withMatchLabels({
           role: 'alert-rules',

View File

@@ -0,0 +1,19 @@
+{
+  prometheusRules+:: {
+    groups+: [
+      {
+        name: 'kube-prometheus-general.rules',
+        rules: [
+          {
+            expr: 'count without(instance, pod, node) (up == 1)',
+            record: 'count:up1',
+          },
+          {
+            expr: 'count without(instance, pod, node) (up == 0)',
+            record: 'count:up0',
+          },
+        ],
+      },
+    ],
+  },
+}

View File

@@ -1 +1,2 @@
-(import 'node-rules.libsonnet')
+(import 'node-rules.libsonnet') +
+(import 'general.libsonnet')

View File

@@ -72,8 +72,8 @@
         "subdir": ""
       }
     },
-    "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5",
-    "sum": "qfm0EpLrEZ1+fe93LFLa9tyOalK6JehpholxO2d0xXU="
+    "version": "a132ade95740f9364e477ae8e730eabd650d14cb",
+    "sum": "+5+biGgOmWhNenvUxAtdejDgL3FvdDp6Dv84v3Gdg6A="
   },
   {
     "name": "node-mixin",

View File

@@ -269,7 +269,7 @@ items:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (verb, le))",
+              "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", verb!=\"WATCH\"}[5m])) by (verb, le))",
               "format": "time_series",
               "intervalFactor": 2,
               "legendFormat": "{{verb}}",
@@ -22649,7 +22649,7 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+              "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
               "format": "time_series",
               "interval": "1m",
               "intervalFactor": 2,
@@ -22657,7 +22657,7 @@
               "refId": "A"
             },
             {
-              "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+              "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
               "format": "time_series",
               "interval": "1m",
               "intervalFactor": 2,
@@ -22665,7 +22665,7 @@
               "refId": "B"
             },
             {
-              "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
+              "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
               "format": "time_series",
               "interval": "1m",
               "intervalFactor": 2,
@@ -24915,7 +24915,7 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum by(container) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})",
+              "expr": "sum by(container) (container_memory_working_set_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})",
               "format": "time_series",
               "intervalFactor": 2,
               "legendFormat": "Current: {{ container }}",

View File

@@ -44,7 +44,7 @@ spec:
           readOnly: true
       - args:
         - --logtostderr
-        - --secure-listen-address=$(IP):9100
+        - --secure-listen-address=[$(IP)]:9100
        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
        - --upstream=http://127.0.0.1:9100/
        env:

View File

@@ -14,6 +14,7 @@ spec:
   baseImage: quay.io/prometheus/prometheus
   nodeSelector:
     kubernetes.io/os: linux
+  podMonitorNamespaceSelector: {}
   podMonitorSelector: {}
   replicas: 2
   resources:

View File

@@ -40,10 +40,10 @@ spec:
         rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
       record: instance:node_vmstat_pgmajfault:rate1m
     - expr: |
-        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
       record: instance_device:node_disk_io_time_seconds:rate1m
     - expr: |
-        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
       record: instance_device:node_disk_io_time_weighted_seconds:rate1m
     - expr: |
         sum without (device) (
@@ -68,17 +68,22 @@ spec:
   - name: kube-apiserver.rules
     rules:
     - expr: |
-        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+        sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
+        /
+        sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
+      record: cluster:apiserver_request_duration_seconds:mean5m
+    - expr: |
+        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.99"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
     - expr: |
-        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.9"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
     - expr: |
-        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.5"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
@@ -251,6 +256,12 @@ spec:
     - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
         BY (instance, cpu))
       record: cluster:node_cpu:ratio
+  - name: kube-prometheus-general.rules
+    rules:
+    - expr: count without(instance, pod, node) (up == 1)
+      record: count:up1
+    - expr: count without(instance, pod, node) (up == 0)
+      record: count:up0
   - name: node-exporter
     rules:
     - alert: NodeFilesystemSpaceFillingUp
@@ -425,7 +436,7 @@ spec:
           state for longer than 15 minutes.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
-        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
+        sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
       for: 15m
       labels:
         severity: critical
@@ -753,12 +764,26 @@ spec:
     rules:
     - alert: KubeAPILatencyHigh
       annotations:
-        message: The API server has a 99th percentile latency of {{ $value }} seconds
-          for {{ $labels.verb }} {{ $labels.resource }}.
+        message: The API server has an abnormal latency of {{ $value }} seconds for
+          {{ $labels.verb }} {{ $labels.resource }}.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
-        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1
-      for: 10m
+        (
+          cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
+          >
+          on (verb) group_left()
+          (
+            avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+            +
+            2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+          )
+        ) > on (verb) group_left()
+        1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+        and on (verb,resource)
+        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
+        >
+        1
+      for: 5m
       labels:
         severity: warning
     - alert: KubeAPILatencyHigh
@@ -767,7 +792,7 @@ spec:
          for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
-        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4
+        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
       for: 10m
       labels:
         severity: critical
@@ -1115,10 +1140,13 @@ spec:
     rules:
    - alert: AlertmanagerConfigInconsistent
      annotations:
-        message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
-          are out of sync.
+        message: |
+          The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
+          {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
+          Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
+          {{ end }}
      expr: |
-        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
+        count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})) != 1
      for: 5m
      labels:
        severity: critical

View File

@@ -87,7 +87,7 @@ func TestQueryPrometheus(t *testing.T) {
 	}
 
 	// Wait for pod to respond at queries at all. Then start verifying their results.
-	err := wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) {
+	err := wait.Poll(5*time.Second, 2*time.Minute, func() (bool, error) {
 		_, err := promClient.query("up")
 		return err == nil, nil
 	})

View File

@@ -10,19 +10,33 @@ set -x
 curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
 chmod +x kubectl
 
-curl -Lo kind https://github.com/kubernetes-sigs/kind/releases/download/v0.4.0/kind-linux-amd64
+curl -Lo kind https://github.com/kubernetes-sigs/kind/releases/download/v0.6.1/kind-linux-amd64
 chmod +x kind
 
-./kind create cluster
-export KUBECONFIG="$(./kind get kubeconfig-path)"
-# create namespace, permissions, and CRDs
-./kubectl create -f manifests/setup
-# wait for CRD creation to complete
-until ./kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done
-# create monitoring components
-./kubectl create -f manifests/
+run_e2e_tests() {
+    cluster_version=$1
+    ./kind create cluster --image=kindest/node:$cluster_version
+    export KUBECONFIG="$(./kind get kubeconfig-path)"
+    # create namespace, permissions, and CRDs
+    ./kubectl create -f manifests/setup
+    # wait for CRD creation to complete
+    until ./kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done
+    # create monitoring components
+    ./kubectl create -f manifests/
+    make test-e2e
+    ./kind delete cluster
+}
 
-make test-e2e
+cluster_compatible_versions=("v1.14.1" "v1.15.0" "v1.16.1" "v1.17.0")
+
+for cluster_version in "${cluster_compatible_versions[@]}"
+do
+    run_e2e_tests $cluster_version
+done