Compare commits

24 Commits

Author SHA1 Message Date
Frederic Branczyk
a95e9dada4 Merge pull request #972 from zzzpoint/release-0.3-fix
Use etcd release-3.4 instead of master to fix broken dependency
2021-02-23 17:35:04 +01:00
Kriuchkov
6fa8bfae2e Use etcd release-3.4 instead of master to fix broken dependency 2021-02-23 10:47:18 -05:00
redwarn
53f18a4276 modify kube-prometheus-static-etcd.libsonnet servicemonitorEtcd namespace (#592)
* Change the servicemonitorEtcd namespace kube-system to $._config.namespace

* delete  Watchdog of alert rule

* Revert "delete  Watchdog of alert rule"

This reverts commit 815b922ead.

Co-authored-by: tyger <tyger.cheng@oriente.com>
2020-06-30 09:15:19 +02:00
Frederic Branczyk
058439f108 Merge pull request #581 from simonpasquier/backport-576-to-release-0.3
Backport #576 to release 0.3
2020-06-23 16:10:57 +02:00
Simon Pasquier
e6529d950f manifests: regenerate 2020-06-22 17:17:10 +02:00
Simon Pasquier
ddd3fd7fe5 Fix AlertmanagerConfigInconsistent alert
Previously the alert would fire when the number of Alertmanager pods
didn't match the number of replicas defined in the Alertmanager spec
even though all the running pods had the same configuration hash. This
type of issue is already covered by KubeStatefulSetUpdateNotRolledOut
(and possibly KubePodNotReady), having AlertmanagerConfigInconsistent
also active in this situation creates unnecessary noise.

With this change, the alert expression only returns when Alertmanager
pods have different configuration hash values irrespective of the number
of pod replicas. The message annotation has also been enhanced to report
the configuration hash for each pod.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
2020-06-22 16:59:24 +02:00
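
For readers jumping ahead to the diffs below, a minimal sketch of the reworked rule described in this commit message (the job selector is abridged from the generated manifest in this compare, and the comments are editorial): the inner count_values() emits one series per distinct configuration hash per namespace/service, so the outer count() differs from 1 exactly when pods of the same Alertmanager disagree on their configuration.

{
  alert: 'AlertmanagerConfigInconsistent',
  // count_values() yields one series per distinct config hash per namespace/service;
  // the outer count() is != 1 only when pods report different hashes.
  expr: |||
    count by(namespace, service) (
      count_values by(namespace, service) ("config_hash", alertmanager_config_hash{job="alertmanager-main"})
    ) != 1
  |||,
  'for': '5m',
  labels: { severity: 'critical' },
}
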
Lili Cosic
ca9bf08a8a Merge pull request #549 from dgrisonnet/pin-kubernetes-mixin-0.3
Pin kubernetes-mixin version in release-0.3
2020-05-27 09:58:22 +02:00
Damien Grisonnet
4dd632afc0 jsonnet: pin kubernetes-mixin version
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-05-26 18:21:38 +02:00
Frederic Branczyk
e964410209 Merge pull request #513 from omerlh/cherry-pick
Allow to configure EKS available IPs alert
2020-04-27 15:49:25 +02:00
Omer Levi Hevroni
326eecf9af Allow to configure EKS available IPs alert 2020-04-27 08:41:30 +03:00
Lili Cosic
a98d4be60e Merge pull request #475 from dgrisonnet/ci-test-compat
ci: update release-0.3 e2e tests according to compat matrix
2020-04-01 19:01:58 +02:00
Damien Grisonnet
3101c620c0 test: increase pod polling time
The original polling time was a bit short for all pods to be up which made e2e
tests fail half of the time.

Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-03-31 18:24:10 +02:00
Damien Grisonnet
f3a47b9bba ci: update e2e tests according to compat matrix
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-03-31 15:47:57 +02:00
Lili Cosic
059149d37c Merge pull request #445 from dgrisonnet/backport-podmonitor
Backport podmonitor to release-0.3
2020-03-10 15:43:12 +01:00
Jonathan Amiez
1ade732468 Enable PodMonitors discovery across namespaces 2020-03-10 15:22:16 +01:00
Jonathan Amiez
f62ba1e136 Update generated manifests 2020-03-10 15:22:03 +01:00
Sergiusz Urbaniak
ed71719c8e Merge pull request #419 from s-urbaniak/count-0.3
[backport] jsonnet: add general rules for up/down targets
2020-02-20 10:00:50 +01:00
Sergiusz Urbaniak
8fd8248928 Makefile: pin jsonnet-ci to 0.36 2020-02-20 09:04:19 +01:00
Sergiusz Urbaniak
d1b81cc1ac manifests: regenerate 2020-02-19 09:15:04 +01:00
Sergiusz Urbaniak
6a19c05248 jsonnet: add general rules for up/down targets 2020-02-19 09:13:23 +01:00
Sergiusz Urbaniak
989c6813aa Merge pull request #366 from paulfantom/backport_ipv6
Backport ipv6 compatibility (#326)
2020-01-08 15:21:52 +01:00
paulfantom
5e5d1297f4 manifests: regenerate 2020-01-08 14:45:55 +01:00
paulfantom
7a94c41e08 jsonnet/kube-prometheus/node-exporter: fix typo 2020-01-08 14:43:39 +01:00
paulfantom
6b3cb71ab2 jsonnet/kube-prometheus/node-exporter: wrap pod ip address in square brackets for ipv6 compatibility reasons 2020-01-08 14:30:18 +01:00
16 changed files with 118 additions and 43 deletions

View File

@@ -15,7 +15,7 @@ CONTAINER_CMD:=docker run --rm \
-v "$(shell go env GOCACHE):/.cache/go-build" \
-v "$(PWD):/go/src/github.com/coreos/kube-prometheus:Z" \
-w "/go/src/github.com/coreos/kube-prometheus" \
- quay.io/coreos/jsonnet-ci
+ quay.io/coreos/jsonnet-ci:release-0.36
all: generate fmt test

View File

@@ -7,10 +7,15 @@
{
alert: 'AlertmanagerConfigInconsistent',
annotations: {
- message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.',
+ message: |||
+ The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
+ {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
+ Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
+ {{ end }}
+ |||,
},
expr: |||
count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
||| % $._config,
'for': '5m',
labels: {

View File

@@ -18,7 +18,7 @@
"subdir": ""
}
},
"version": "master"
"version": "release-0.2"
},
{
"name": "grafana",
@@ -48,7 +48,7 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "master"
"version": "release-3.4"
},
{
"name": "prometheus",

View File

@@ -3,6 +3,12 @@ local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
+ _config+:: {
+ eks: {
+ minimumAvailableIPs: 10,
+ minimumAvailableIPsTime: '10m'
+ }
+ },
prometheus+: {
AwsEksCniMetricService:
service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
@@ -48,14 +54,14 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
name: 'kube-prometheus-eks.rules',
rules: [
{
- expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
+ expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs,
labels: {
severity: 'critical',
},
annotations: {
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.'
},
- 'for': '10m',
+ 'for': $._config.eks.minimumAvailableIPsTime,
alert: 'EksAvailableIPs'
},
],
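
The hunk above introduces two tunables under _config.eks. A hypothetical user overlay, assuming the usual kube-prometheus import paths (only the two eks field names come from this diff; the values below are illustrative):

(import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-eks.libsonnet') +
{
  _config+:: {
    eks+: {
      minimumAvailableIPs: 25,         // fire when fewer free pod IPs than this remain
      minimumAvailableIPsTime: '15m',  // how long the shortage must persist before alerting
    },
  },
}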

View File

@@ -50,7 +50,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
kind: 'ServiceMonitor',
metadata: {
name: 'etcd',
- namespace: 'kube-system',
+ namespace: $._config.namespace,
labels: {
'k8s-app': 'etcd',
},

View File

@@ -105,7 +105,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) +
container.withArgs([
'--logtostderr',
- '--secure-listen-address=$(IP):' + $._config.nodeExporter.port,
+ '--secure-listen-address=[$(IP)]:' + $._config.nodeExporter.port,
'--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites),
'--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/',
]) +

View File

@@ -182,6 +182,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
serviceMonitorSelector: {},
podMonitorSelector: {},
serviceMonitorNamespaceSelector: {},
+ podMonitorNamespaceSelector: {},
nodeSelector: { 'kubernetes.io/os': 'linux' },
ruleSelector: selector.withMatchLabels({
role: 'alert-rules',
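
An empty podMonitorNamespaceSelector lets Prometheus discover PodMonitors in every namespace, which is what the backported "Enable PodMonitors discovery across namespaces" commit enables. To narrow discovery back down, an overlay along these lines should work; the override path follows the usual kube-prometheus pattern and the namespace label is purely illustrative:

{
  prometheus+:: {
    prometheus+: {
      spec+: {
        // only discover PodMonitors from namespaces carrying this (illustrative) label
        podMonitorNamespaceSelector: { matchLabels: { monitoring: 'enabled' } },
      },
    },
  },
}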

View File

@@ -0,0 +1,19 @@
+ {
+ prometheusRules+:: {
+ groups+: [
+ {
+ name: 'kube-prometheus-general.rules',
+ rules: [
+ {
+ expr: 'count without(instance, pod, node) (up == 1)',
+ record: 'count:up1',
+ },
+ {
+ expr: 'count without(instance, pod, node) (up == 0)',
+ record: 'count:up0',
+ },
+ ],
+ },
+ ],
+ },
+ }
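
The two recorded series summarise target health per remaining label set (instance, pod and node are dropped). A hypothetical consumer, written in the same style purely as an illustration: an alert that fires when a label set has down targets and no up targets left.

{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'example-target-coverage.rules',  // illustrative group name
        rules: [
          {
            alert: 'ExampleAllTargetsDown',  // illustrative alert name
            // a series present in count:up0 but absent from count:up1 means
            // some targets are down and none of the matching targets are up
            expr: 'count:up0 unless count:up1',
            'for': '10m',
            labels: { severity: 'warning' },
          },
        ],
      },
    ],
  },
}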

View File

@@ -1 +1,2 @@
- (import 'node-rules.libsonnet')
+ (import 'node-rules.libsonnet') +
+ (import 'general.libsonnet')

View File

@@ -72,8 +72,8 @@
"subdir": ""
}
},
"version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5",
"sum": "qfm0EpLrEZ1+fe93LFLa9tyOalK6JehpholxO2d0xXU="
"version": "a132ade95740f9364e477ae8e730eabd650d14cb",
"sum": "+5+biGgOmWhNenvUxAtdejDgL3FvdDp6Dv84v3Gdg6A="
},
{
"name": "node-mixin",

View File

@@ -269,7 +269,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (verb, le))",
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", verb!=\"WATCH\"}[5m])) by (verb, le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{verb}}",
@@ -22649,7 +22649,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -22657,7 +22657,7 @@ items:
"refId": "A"
},
{
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -22665,7 +22665,7 @@ items:
"refId": "B"
},
{
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+\"}[$__interval])",
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -24915,7 +24915,7 @@ items:
"steppedLine": false,
"targets": [
{
"expr": "sum by(container) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})",
"expr": "sum by(container) (container_memory_working_set_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Current: {{ container }}",

View File

@@ -44,7 +44,7 @@ spec:
readOnly: true
- args:
- --logtostderr
- - --secure-listen-address=$(IP):9100
+ - --secure-listen-address=[$(IP)]:9100
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
- --upstream=http://127.0.0.1:9100/
env:

View File

@@ -14,6 +14,7 @@ spec:
baseImage: quay.io/prometheus/prometheus
nodeSelector:
kubernetes.io/os: linux
+ podMonitorNamespaceSelector: {}
podMonitorSelector: {}
replicas: 2
resources:

View File

@@ -40,10 +40,10 @@ spec:
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: |
- rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+ rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
- rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+ rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
@@ -68,17 +68,22 @@ spec:
- name: kube-apiserver.rules
rules:
- expr: |
- histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+ sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
+ /
+ sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
+ record: cluster:apiserver_request_duration_seconds:mean5m
+ - expr: |
+ histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
- histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+ histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
- histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+ histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
@@ -251,6 +256,12 @@ spec:
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu))
record: cluster:node_cpu:ratio
+ - name: kube-prometheus-general.rules
+ rules:
+ - expr: count without(instance, pod, node) (up == 1)
+ record: count:up1
+ - expr: count without(instance, pod, node) (up == 0)
+ record: count:up0
- name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
@@ -425,7 +436,7 @@ spec:
state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: |
- sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
+ sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
for: 15m
labels:
severity: critical
@@ -753,12 +764,26 @@ spec:
rules:
- alert: KubeAPILatencyHigh
annotations:
- message: The API server has a 99th percentile latency of {{ $value }} seconds
- for {{ $labels.verb }} {{ $labels.resource }}.
+ message: The API server has an abnormal latency of {{ $value }} seconds for
+ {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: |
- cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1
- for: 10m
+ (
+ cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
+ >
+ on (verb) group_left()
+ (
+ avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+ +
+ 2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+ )
+ ) > on (verb) group_left()
+ 1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+ and on (verb,resource)
+ cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
+ >
+ 1
+ for: 5m
labels:
severity: warning
- alert: KubeAPILatencyHigh
@@ -767,7 +792,7 @@ spec:
for {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: |
- cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4
+ cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
for: 10m
labels:
severity: critical
@@ -1115,10 +1140,13 @@ spec:
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
- message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
- are out of sync.
+ message: |
+ The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
+ {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
+ Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
+ {{ end }}
expr: |
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})) != 1
for: 5m
labels:
severity: critical

View File

@@ -87,7 +87,7 @@ func TestQueryPrometheus(t *testing.T) {
}
// Wait for pod to respond at queries at all. Then start verifying their results.
- err := wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) {
+ err := wait.Poll(5*time.Second, 2*time.Minute, func() (bool, error) {
_, err := promClient.query("up")
return err == nil, nil
})

View File

@@ -10,10 +10,13 @@ set -x
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
chmod +x kubectl
- curl -Lo kind https://github.com/kubernetes-sigs/kind/releases/download/v0.4.0/kind-linux-amd64
+ curl -Lo kind https://github.com/kubernetes-sigs/kind/releases/download/v0.6.1/kind-linux-amd64
chmod +x kind
- ./kind create cluster
+ run_e2e_tests() {
+ cluster_version=$1
+ ./kind create cluster --image=kindest/node:$cluster_version
export KUBECONFIG="$(./kind get kubeconfig-path)"
# create namespace, permissions, and CRDs
@@ -26,3 +29,14 @@ until ./kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""
./kubectl create -f manifests/
make test-e2e
./kind delete cluster
+ }
+ cluster_compatible_versions=("v1.14.1" "v1.15.0" "v1.16.1" "v1.17.0")
+ for cluster_version in "${cluster_compatible_versions[@]}"
+ do
+ run_e2e_tests $cluster_version
done