Compare commits

..

89 Commits

Author SHA1 Message Date
Lili Cosic
ee8077db04 Merge pull request #476 from dgrisonnet/ci-test-compat-04
ci: update release-0.4 e2e tests according to compat matrix
2020-04-01 19:02:30 +02:00
Damien Grisonnet
d3bee7fa1a test: increase pod polling time
The original polling time was a bit short for all pods to be up which made e2e
tests fail half of the time.

Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-03-31 18:26:56 +02:00
Damien Grisonnet
106132ac18 Makefile: pin jsonnet-ci to 0.36
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-03-31 16:21:06 +02:00
Damien Grisonnet
8961be9639 ci: update e2e tests according to compat matrix
Signed-off-by: Damien Grisonnet <dgrisonn@redhat.com>
2020-03-31 14:38:20 +02:00
Frederic Branczyk
ae589e91ce Merge pull request #401 from s-urbaniak/up-down-0.4
[backport] jsonnet: add general rules for up/down targets
2020-01-30 16:05:49 +01:00
Sergiusz Urbaniak
8367575768 manifests: regenerate 2020-01-30 14:34:58 +01:00
Sergiusz Urbaniak
6b5033d65e jsonnet: add general rules for up/down targets 2020-01-30 14:29:40 +01:00
Paweł Krupa
68d6e611c6 Fast forward release-0.4 to master (#389)
Fast forward release-0.4 to master
2020-01-23 15:36:04 +01:00
Frederic Branczyk
f2b4528b63 Merge pull request #387 from brancz/reduce-histogram-buckets
*: Throw away unused high cardinality apiserver duration buckets
2020-01-23 15:32:18 +01:00
Krasi Georgiev
be8eb39024 re-added most collectors
Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
2020-01-23 15:18:59 +01:00
Krasi Georgiev
629e86e53a remove some unused collectors
Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
2020-01-23 15:18:59 +01:00
Frederic Branczyk
a7628e0223 Merge pull request #381 from krasi-georgiev/remove-collectors
remove some unused collectors
2020-01-23 14:50:47 +01:00
Krasi Georgiev
8984606f5d re-added most collectors
Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
2020-01-23 15:17:56 +02:00
Frederic Branczyk
48d95f0b9f *: Throw away unused high cardinality apiserver duration buckets 2020-01-23 13:24:42 +01:00
Frederic Branczyk
e410043b6b Merge pull request #386 from paulfantom/bump_kube-mix
Bump kubernetes-mixins
2020-01-23 12:22:40 +01:00
paulfantom
894069f24d manifests: regenerate 2020-01-23 12:01:21 +01:00
paulfantom
d074ea1427 bump kubernetes-mixins dependency 2020-01-23 12:01:10 +01:00
Frederic Branczyk
269aef6e37 Merge pull request #384 from s-urbaniak/agg
prometheus-adapter: add nodes resource to aggregated-metrics-reader
2020-01-22 09:45:38 +01:00
Sergiusz Urbaniak
90e5982de4 manifests: regenerate 2020-01-21 20:43:47 +01:00
Sergiusz Urbaniak
7165938b39 prometheus-adapter: add nodes resource to aggregated-metrics-reader 2020-01-21 18:36:52 +01:00
Frederic Branczyk
9ebe632d5d Merge pull request #380 from omerlh/prom-all-namespaces
added patch to allow prom to watch all namespaces
2020-01-20 14:16:29 +01:00
Lili Cosic
72ae778bfc Merge pull request #382 from tlereste/update_kube_state_metrics
bump kube-state-metrics to version 1.9.2
2020-01-17 11:17:57 +01:00
Thibault Le Reste
0608c96bf6 bump kube-state-metrics to version 1.9.2 2020-01-15 13:12:35 +01:00
Krasi Georgiev
44f3c61010 remove some unused collectors
Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
2020-01-15 12:03:04 +02:00
omerlh
f517b35a42 added patch to allow prom to watch all namespaces 2020-01-14 17:55:27 +02:00
Frederic Branczyk
54c0fda307 Merge pull request #378 from LiliC/drop-less
jsonnet,manifests: Do not drop not all metrics
2020-01-14 14:55:54 +01:00
Lili Cosic
6a3d667d3e manifests: Regenerate files 2020-01-14 10:34:46 +01:00
Lili Cosic
d9d3139dc8 jsonnet: Drop exact metrics 2020-01-14 10:26:42 +01:00
Frederic Branczyk
67ed0f63c2 Merge pull request #371 from tlereste/update_kube_state_metrics_version
update kube-state-metrics version to 1.9.1
2020-01-10 14:47:42 +01:00
Thibault Le Reste
7788d0d327 update kube-state-metrics version to 1.9.1 2020-01-10 14:23:52 +01:00
Lili Cosic
fca505f2a2 Merge pull request #368 from jfassad/master
jsonnet/kube-prometheus/kube-state-metrics: Add missing clusterRole permissions
2020-01-10 11:47:45 +01:00
João Assad
d40548d3a0 manifests: Regenerate manifests 2020-01-09 15:24:50 -03:00
João Assad
dba42d3477 jsonnet/kube-prometheus/kube-state-metrics: add missing clusterRole permissions 2020-01-09 15:12:59 -03:00
Lili Cosic
ee37661c34 Merge pull request #367 from LiliC/bump-k8s
tests/e2e/travis-e2e.sh: Switch to 1.17 k8s cluster
2020-01-09 13:13:39 +01:00
Lili Cosic
8b36950f0e tests/e2e/travis-e2e.sh: Switch to 1.17 k8s cluster 2020-01-09 13:03:01 +01:00
Frederic Branczyk
932745172d Merge pull request #365 from LiliC/drop-kubelet
Drop correct deprecated metrics and add e2e test to ensure that
2020-01-08 17:39:26 +01:00
Lili Cosic
1af59f3130 tests/e2e: Add e2e test to make sure all deprecated metrics are being dropped 2020-01-08 12:35:21 +01:00
Lili Cosic
6562b02da8 manifests/*: Regenerate manifests 2020-01-08 12:35:21 +01:00
Lili Cosic
23999e44df jsonnet/kube-prometheus/prometheus: Drop correct deprecated metrics 2020-01-08 12:35:21 +01:00
Frederic Branczyk
69d3357892 Merge pull request #362 from pgier/lock-version-of-prometheus-operator-jsonnet-dependency
lock prometheus-operator jsonnet dependencies to v0.34.0
2020-01-07 08:06:46 +01:00
Frederic Branczyk
3465b0fa0d Merge pull request #346 from omerlh/patch-1
fix coredns monitoring on EKS
2020-01-06 16:19:16 +01:00
Paul Gier
1d1ce4967f lock prometheus-operator jsonnet dependencies to release-0.34 branch
This prevents mismatch between prometheus-operator binary and related
CRD yaml files.
2020-01-06 09:16:42 -06:00
Frederic Branczyk
3a0e6ba91f Merge pull request #360 from omerlh/patch-2
added metric_path to kubelet/cadvisor selector
2020-01-06 13:24:23 +01:00
omerlh
81e2d19398 run make 2020-01-06 13:49:57 +02:00
Omer Levi Hevroni
92d4cbae08 added metric_path to kubelet/cadvisor selector 2020-01-06 11:52:48 +02:00
Omer Levi Hevroni
2e72a8a832 fix coredns monitoring on EKS 2019-12-23 12:39:21 +02:00
Lili Cosic
9493a1a5f7 Merge pull request #342 from tlereste/update_kube_state_metrics
update kube-state-metrics version to 1.9.0
2019-12-20 16:57:17 +01:00
Thibault LE RESTE
0a48577bb7 update kube-state-metrics version to 1.9.0 2019-12-20 16:21:52 +01:00
Frederic Branczyk
9211c42df0 Merge pull request #336 from LiliC/change-dropped-metrics
jsonnet/kube-prometheus: Adjust dropped deprecated metrics names
2019-12-19 13:05:37 +01:00
Lili Cosic
5cddfd8da7 manifests: Regenerate manifests 2019-12-19 10:10:46 +01:00
Lili Cosic
bd69007c8c jsonnet/kube-prometheus: Adjust dropped deprecated metrics names
The names were not complete in the kubernetes CHANGELOG.
2019-12-19 10:09:34 +01:00
Frederic Branczyk
4f2b9c1ec8 Merge pull request #332 from LiliC/remove-pin-release
jsonnet/kube-prometheus/jsonnetfile.json: Pin prometheus-operator version to master instead
2019-12-18 13:16:03 +01:00
Lili Cosic
0be63d47fc manifests: Regenerate manifests 2019-12-18 11:18:21 +01:00
Lili Cosic
5fe60f37a2 jsonnetfile.lock.json: Update 2019-12-18 11:18:21 +01:00
Lili Cosic
200fee8d7c jsonnet/kube-prometheus/jsonnetfile.json: Pin prometheus-operator version to master instead 2019-12-18 11:18:21 +01:00
Frederic Branczyk
1b9be6d00b Merge pull request #330 from LiliC/remove-depr-metrics
jsonnet,manifests: Drop all metrics which are deprecated in kubernetes
2019-12-17 16:51:40 +01:00
Lili Cosic
ce68c4b392 manifests/*: Regenerate manifest 2019-12-17 15:13:04 +01:00
Lili Cosic
5e9b883528 jsonnet/kube-prometheus*: Drop deprecated kubernetes metrics
These metrics were deprecated in kubernetes from 1.14 and 1.15 onwards.
2019-12-17 15:13:04 +01:00
Paweł Krupa
69b0ba03f1 Merge pull request #329 from paulfantom/e2e
tests/e2e: reenable checking targets availability
2019-12-16 14:40:43 +01:00
paulfantom
3279f222a0 tests/e2e: reenable checking targets availability 2019-12-16 14:23:43 +01:00
Paweł Krupa
543ccec970 Fix typo in node-exporter DaemonSet (#328)
Fix typo in node-exporter DaemonSet
2019-12-16 12:56:49 +01:00
paulfantom
f17ddfd293 assets: regenerate 2019-12-16 12:53:49 +01:00
paulfantom
3b8530d742 jsonnet/kube-prometheus/node-exporter: fix typo 2019-12-16 12:53:39 +01:00
Frederic Branczyk
44fe363211 Merge pull request #327 from paulfantom/deps
Update dependencies
2019-12-16 12:14:26 +01:00
paulfantom
326453cf47 manifests: regenerate 2019-12-16 11:24:04 +01:00
paulfantom
159a14ef47 update jsonnet dependencies 2019-12-16 11:20:37 +01:00
Frederic Branczyk
d03d57e6bb Merge pull request #326 from paulfantom/ipv6
IPv6 compatibility
2019-12-16 10:34:51 +01:00
Frederic Branczyk
31cb71fcd9 Merge pull request #317 from josqu4red/podmonitor-default-ns
Enable discovery of Podmonitors across namespaces
2019-12-12 16:54:39 +01:00
paulfantom
4474b24a32 manifests: regenerate 2019-12-12 16:26:58 +01:00
paulfantom
339ade5a81 jsonnet/kube-prometheus/node-exporter: wrap pod ip address in square brackets for ipv6 compatibility reasons 2019-12-12 16:14:08 +01:00
Frederic Branczyk
ce7c5fa3b4 Merge pull request #325 from sereinity-forks/master
Make limits/requests resources of kube-state-metrics removable
2019-12-12 16:06:58 +01:00
Sereinity
3f388b797d Make limits/requests resources of kube-state-metrics removable, unify tuning 2019-12-12 15:50:34 +01:00
Frederic Branczyk
20abdf3b72 Merge pull request #323 from simonpasquier/bump-kubernetes-mixin
Bump kubernetes mixin
2019-12-10 17:05:35 +01:00
Simon Pasquier
cd0f3c641e regenerate
Signed-off-by: Simon Pasquier <spasquie@redhat.com>
2019-12-10 16:48:51 +01:00
Simon Pasquier
408fde189b Bump kubernetes-mixin
Signed-off-by: Simon Pasquier <spasquie@redhat.com>
2019-12-10 16:48:28 +01:00
Jonathan Amiez
90cf0ae21c Update generated manifests 2019-12-05 15:12:18 +01:00
Jonathan Amiez
3ba4b5602a Enable PodMonitors discovery across namespaces 2019-12-05 15:09:40 +01:00
Frederic Branczyk
cb0e6e2c89 Merge pull request #309 from benjaminhuo/master
Group alert by namespace instead of job
2019-12-04 08:38:04 +01:00
Benjamin
03f7adcf92 regenerate
Signed-off-by: Benjamin <benjamin@yunify.com>
2019-12-04 10:14:42 +08:00
Benjamin
fd267aebeb Merge remote-tracking branch 'upstream/master' 2019-12-04 10:09:14 +08:00
Benjamin
420425d88e regenerate
Signed-off-by: Benjamin <benjamin@yunify.com>
2019-12-03 23:46:08 +08:00
Benjamin
965bec0ad7 Change Alertmanager group by condition
Signed-off-by: Benjamin <benjamin@yunify.com>
2019-12-03 20:02:47 +08:00
Frederic Branczyk
d22bad8293 Merge pull request #313 from yeya24/update-apiverison
Update apiversion
2019-12-03 11:22:47 +01:00
Frederic Branczyk
8c255e9e6c Merge pull request #310 from paulfantom/node-exporter-scrape-interval
Change node-exporter scrape interval to follow best practices
2019-12-03 10:15:52 +01:00
yeya24
56027ac757 update apiversion
Signed-off-by: yeya24 <yb532204897@gmail.com>
2019-12-01 09:33:11 -05:00
paulfantom
50b06b0d33 manifests: regenerate 2019-11-27 15:11:06 +01:00
paulfantom
6f6fd65a48 jsonnet/kube-prometheus/node-exporter: follow node-exporter best practices and scrape data every 15s 2019-11-27 15:09:04 +01:00
Frederic Branczyk
f48fe057dc Merge pull request #307 from EricHorst/patch-1
Update README.md with apply clarification.
2019-11-21 17:41:53 -08:00
Eric Horst
8487871388 Update README.md with apply clarification.
Update the kubectl apply commands in the customizing section to match those in the quickstart section. The customizing section did not account for the recently introduced setup/ subdirectory.
2019-11-17 21:10:32 -08:00
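The commit "lock prometheus-operator jsonnet dependencies to release-0.34 branch" above pins the jsonnet dependency so the operator binary and its CRD manifests stay in step. For reference, the same kind of pin is usually applied with jsonnet-bundler; this is only a sketch, and the dependency path shown here is an assumption based on the commit message rather than a command taken from this repository:
```shell
# Pin the prometheus-operator jsonnet library to the release-0.34 branch;
# jb records the resolved commit in jsonnetfile.lock.json. Regenerate the
# manifests afterwards with the repository's usual build step.
$ jb install github.com/coreos/prometheus-operator/jsonnet/prometheus-operator@release-0.34
```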
32 changed files with 1579 additions and 541 deletions

View File

@@ -236,8 +236,13 @@ The previous steps (compilation) has created a bunch of manifest files in the ma
Now simply use `kubectl` to install Prometheus and Grafana as per your configuration:
```shell
# Update the namespace and CRDs, and then wait for them to be available before creating the remaining resources
$ kubectl apply -f manifests/setup
$ kubectl apply -f manifests/
```
Alternatively, the resources in both folders can be applied with a single command
`kubectl apply -Rf manifests`, but it may be necessary to run the command multiple times for all components to
be created successfully.
Check the monitoring namespace (or the namespace you have specified in `namespace: `) and make sure the pods are running. Prometheus and Grafana should be up and running soon.
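For clusters where the CRDs take a moment to register, an explicit wait between the two steps can replace the "run it multiple times" approach; a minimal sketch, assuming the default `monitoring` namespace and the ServiceMonitor CRD shipped in `manifests/setup`:
```shell
# Create namespace and CRDs, wait until the ServiceMonitor CRD is served,
# then create the remaining resources and check that the pods come up.
$ kubectl apply -f manifests/setup
$ until kubectl get servicemonitors --all-namespaces; do date; sleep 1; done
$ kubectl apply -f manifests/
$ kubectl -n monitoring get pods
```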

View File

@@ -14,12 +14,16 @@ spec:
port: 8080
targetPort: web
---
apiVersion: extensions/v1beta1
apiVersion: apps/v1
kind: Deployment
metadata:
name: example-app
namespace: default
spec:
selector:
matchLabels:
app: example-app
version: 1.1.3
replicas: 4
template:
metadata:

View File

@@ -14,6 +14,14 @@ rules:
- get
- list
- watch
- apiGroups:
- "apps"
resources:
- deployments
verbs:
- get
- list
- watch
- apiGroups:
- "extensions"
resources:

View File

@@ -1,4 +1,4 @@
apiVersion: extensions/v1beta1
apiVersion: apps/v1
kind: Deployment
metadata:
name: metrics-server

View File

@@ -19,7 +19,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
resolve_timeout: '5m',
},
route: {
group_by: ['job'],
group_by: ['namespace'],
group_wait: '30s',
group_interval: '5m',
repeat_interval: '12h',

View File

@@ -7,15 +7,10 @@
{
alert: 'AlertmanagerConfigInconsistent',
annotations: {
message: |||
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
{{ end }}
|||,
message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.',
},
expr: |||
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
||| % $._config,
'for': '5m',
labels: {

View File

@@ -0,0 +1,50 @@
[
// Drop all kubelet metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)',
action: 'drop',
},
// Drop all scheduler metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)',
action: 'drop',
},
// Drop all apiserver metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)',
action: 'drop',
},
// Drop all docker metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)',
action: 'drop',
},
// Drop all reflector metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)',
action: 'drop',
},
// Drop all etcd metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)',
action: 'drop',
},
// Drop all transformation metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'transformation_(transformation_latencies_microseconds|failures_total)',
action: 'drop',
},
// Drop all other metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: '(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)',
action: 'drop',
},
]
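A quick way to verify that relabelings like the ones above are effective is to query Prometheus for one of the dropped series and expect an empty result. A sketch, assuming the default `prometheus-k8s` service in the `monitoring` namespace and a local port-forward:
```shell
# In one terminal: forward the Prometheus API to localhost.
$ kubectl -n monitoring port-forward svc/prometheus-k8s 9090
# In another terminal: a dropped metric such as apiserver_request_count
# should no longer return any series (result length 0).
$ curl -s 'http://localhost:9090/api/v1/query?query=apiserver_request_count' | jq '.data.result | length'
```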

View File

@@ -18,7 +18,7 @@
"subdir": ""
}
},
"version": "release-0.2"
"version": "master"
},
{
"name": "grafana",
@@ -48,7 +48,7 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "release-3.4"
"version": "master"
},
{
"name": "prometheus",

View File

@@ -0,0 +1,20 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
prometheus+:: {
clusterRole+: {
rules+:
local role = k.rbac.v1.role;
local policyRule = role.rulesType;
local rule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'services',
'endpoints',
'pods',
]) +
policyRule.withVerbs(['get', 'list', 'watch']);
[rule]
},
}
}

View File

@@ -3,13 +3,18 @@ local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
_config+:: {
eks: {
minimumAvailableIPs: 10,
minimumAvailableIPsTime: '10m'
}
},
prometheus+: {
serviceMonitorCoreDNS+: {
spec+: {
endpoints: [
{
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
interval: "15s",
targetPort: 9153
}
]
},
},
AwsEksCniMetricService:
service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
service.mixin.metadata.withNamespace('kube-system') +
@@ -54,14 +59,14 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
name: 'kube-prometheus-eks.rules',
rules: [
{
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs,
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
labels: {
severity: 'critical',
},
annotations: {
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.'
},
'for': $._config.eks.minimumAvailableIPsTime,
'for': '10m',
alert: 'EksAvailableIPs'
},
],

View File

@@ -50,7 +50,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
kind: 'ServiceMonitor',
metadata: {
name: 'etcd',
namespace: $._config.namespace,
namespace: 'kube-system',
labels: {
'k8s-app': 'etcd',
},

View File

@@ -9,6 +9,9 @@
'kube-rbac-proxy'+: {
limits: {},
},
'kube-state-metrics'+: {
limits: {},
},
'node-exporter'+: {
limits: {},
},

View File

@@ -78,8 +78,8 @@ local configMapList = k3.core.v1.configMapList;
// 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2
],
cadvisorSelector: 'job="kubelet"',
kubeletSelector: 'job="kubelet"',
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
kubeStateMetricsSelector: 'job="kube-state-metrics"',
nodeExporterSelector: 'job="node-exporter"',
notKubeDnsSelector: 'job!="kube-dns"',
@@ -116,6 +116,10 @@ local configMapList = k3.core.v1.configMapList;
requests: { cpu: '10m', memory: '20Mi' },
limits: { cpu: '20m', memory: '40Mi' },
},
'kube-state-metrics': {
requests: { cpu: '100m', memory: '150Mi' },
limits: { cpu: '100m', memory: '150Mi' },
},
'node-exporter': {
requests: { cpu: '102m', memory: '180Mi' },
limits: { cpu: '250m', memory: '180Mi' },

View File

@@ -8,13 +8,10 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
collectors: '', // empty string gets a default set
scrapeInterval: '30s',
scrapeTimeout: '30s',
baseCPU: '100m',
baseMemory: '150Mi',
},
versions+:: {
kubeStateMetrics: 'v1.8.0',
kubeStateMetrics: 'v1.9.2',
kubeRbacProxy: 'v0.4.1',
},
@@ -125,6 +122,22 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
rulesType.withApiGroups(['storage.k8s.io']) +
rulesType.withResources([
'storageclasses',
'volumeattachments',
]) +
rulesType.withVerbs(['list', 'watch']),
rulesType.new() +
rulesType.withApiGroups(['admissionregistration.k8s.io']) +
rulesType.withResources([
'validatingwebhookconfigurations',
'mutatingwebhookconfigurations',
]) +
rulesType.withVerbs(['list', 'watch']),
rulesType.new() +
rulesType.withApiGroups(['networking.k8s.io']) +
rulesType.withResources([
'networkpolicies',
]) +
rulesType.withVerbs(['list', 'watch']),
];
@@ -174,8 +187,8 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
'--telemetry-host=127.0.0.1',
'--telemetry-port=8082',
] + if $._config.kubeStateMetrics.collectors != '' then ['--collectors=' + $._config.kubeStateMetrics.collectors] else []) +
container.mixin.resources.withRequests({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory }) +
container.mixin.resources.withLimits({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory });
container.mixin.resources.withRequests($._config.resources['kube-state-metrics'].requests) +
container.mixin.resources.withLimits($._config.resources['kube-state-metrics'].limits);
local c = [proxyClusterMetrics, proxySelfMetrics, kubeStateMetrics];

View File

@@ -89,7 +89,8 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
'--path.procfs=/host/proc',
'--path.sysfs=/host/sys',
'--path.rootfs=/host/root',
'--no-collector.wifi',
'--no-collector.hwmon',
// The following settings have been taken from
// https://github.com/prometheus/node_exporter/blob/0662673/collector/filesystem_linux.go#L30-L31
// Once node exporter is being released with those settings, this can be removed.
@@ -168,7 +169,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
port: 'https',
scheme: 'https',
interval: '30s',
interval: '15s',
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{

View File

@@ -191,7 +191,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local rules =
policyRule.new() +
policyRule.withApiGroups(['metrics.k8s.io']) +
policyRule.withResources(['pods']) +
policyRule.withResources(['pods', 'nodes']) +
policyRule.withVerbs(['get','list','watch']);
clusterRole.new() +

View File

@@ -160,6 +160,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local resourceRequirements = container.mixin.resourcesType;
local selector = statefulSet.mixin.spec.selectorType;
local resources =
resourceRequirements.new() +
resourceRequirements.withRequests({ memory: '400Mi' });
@@ -284,10 +285,11 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
insecureSkipVerify: true,
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet'),
relabelings: [
{
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path'
targetLabel: 'metrics_path',
},
],
},
@@ -304,7 +306,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
relabelings: [
{
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path'
targetLabel: 'metrics_path',
},
],
metricRelabelings: [
@@ -347,7 +349,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
port: 'http-metrics',
interval: '30s',
metricRelabelings: [
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*',
@@ -402,7 +404,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
serverName: 'kubernetes',
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: [
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*',
@@ -418,6 +420,11 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
regex: 'apiserver_admission_step_admission_latencies_seconds_.*',
action: 'drop',
},
{
sourceLabels: ['__name__', 'le'],
regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)',
action: 'drop',
},
],
},
],

View File

@@ -8,8 +8,8 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "cbc1340af53f50728181f97f6bce442ac33d8993",
"sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw="
"version": "5770a6d286fe48682e29b54ce0df37e7d24b3280",
"sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0="
},
{
"name": "grafana",
@@ -30,7 +30,7 @@
"subdir": "grafana-builder"
}
},
"version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd",
"version": "676ff4b4fe9135f85a5d6e30523d64d2d3713087",
"sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE="
},
{
@@ -41,8 +41,8 @@
"subdir": "grafonnet"
}
},
"version": "b82411476842f583817e67feff5becf1228fd540",
"sum": "mEosZ6hZCTCw8AaASEtRFjY8PSmpvqI3xj6IWpwcroU="
"version": "f3ee1d810858cf556d25f045b53cb0f1fd10b94e",
"sum": "14YBZUP/cl8qi9u86xiuUS4eXQrEAam+4GSg6i9n9Ys="
},
{
"name": "ksonnet",
@@ -72,8 +72,8 @@
"subdir": ""
}
},
"version": "a132ade95740f9364e477ae8e730eabd650d14cb",
"sum": "+5+biGgOmWhNenvUxAtdejDgL3FvdDp6Dv84v3Gdg6A="
"version": "68f82d2a428d91df57e9af43739981a6a8ede897",
"sum": "J/tuXi0Z8GRHo63pM17YFIyk4QgkFuMcQ20mAxi1flM="
},
{
"name": "node-mixin",
@@ -83,7 +83,7 @@
"subdir": "docs/node-mixin"
}
},
"version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5",
"version": "2cae917bb7e0b6379221e8a24da012b16e63d661",
"sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg="
},
{
@@ -94,8 +94,8 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "431844f0a7c289e4255a68f09a18fcca09637fb2",
"sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM="
"version": "31700a05df64c2b4e32bb0ecd8baa25279144778",
"sum": "/cohvDTaIiLElG66tKeQsi4v1M9mlGDKjOBSWivL9TU="
},
{
"name": "prometheus-operator",
@@ -116,8 +116,19 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5",
"version": "a7ee9d1abe1b1a3670a02ede1135cadb660b9d0c",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
"name": "slo-libsonnet",
"source": {
"git": {
"remote": "https://github.com/metalmatze/slo-libsonnet",
"subdir": "slo-libsonnet"
}
},
"version": "437c402c5f3ad86c3c16db8471f1649284fef0ee",
"sum": "2Zcyku1f558VrUpMaJnI78fahDksPLcS1idmxxwcQ7Q="
}
]
}

View File

@@ -1,6 +1,6 @@
apiVersion: v1
data:
alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gImpvYiIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gIm5hbWVzcGFjZSIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
kind: Secret
metadata:
name: alertmanager-main
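Because the Secret payload above is base64-encoded, the route change (grouping by `namespace` instead of `job`) is easiest to confirm by decoding it in a running cluster; a sketch assuming the default `monitoring` namespace:
```shell
# Decode the generated Alertmanager configuration and inspect route.group_by.
$ kubectl -n monitoring get secret alertmanager-main \
    -o jsonpath='{.data.alertmanager\.yaml}' | base64 -d
```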

File diff suppressed because it is too large

View File

@@ -86,6 +86,22 @@ rules:
- storage.k8s.io
resources:
- storageclasses
- volumeattachments
verbs:
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
- mutatingwebhookconfigurations
verbs:
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
verbs:
- list
- watch

View File

@@ -55,7 +55,7 @@ spec:
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: quay.io/coreos/kube-state-metrics:v1.8.0
image: quay.io/coreos/kube-state-metrics:v1.9.2
name: kube-state-metrics
resources:
limits:

View File

@@ -20,6 +20,8 @@ spec:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --no-collector.wifi
- --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
image: quay.io/prometheus/node-exporter:v0.18.1

View File

@@ -8,7 +8,7 @@ metadata:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
interval: 15s
port: https
relabelings:
- action: replace

View File

@@ -11,6 +11,7 @@ rules:
- metrics.k8s.io
resources:
- pods
- nodes
verbs:
- get
- list

View File

@@ -40,10 +40,10 @@ spec:
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
@@ -68,66 +68,74 @@ spec:
- name: kube-apiserver.rules
rules:
- expr: |
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
record: cluster:apiserver_request_duration_seconds:mean5m
- expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- name: k8s.rules
rules:
- expr: |
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace)
sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace)
record: namespace:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum by (namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])
) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
sum by (cluster, namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
- expr: |
container_memory_working_set_bytes{job="kubelet", image!=""}
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_memory_working_set_bytes
- expr: |
container_memory_rss{job="kubelet", image!=""}
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_memory_rss
- expr: |
container_memory_cache{job="kubelet", image!=""}
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_memory_cache
- expr: |
container_memory_swap{job="kubelet", image!=""}
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_memory_swap
- expr: |
sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
record: namespace:container_memory_usage_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
* on (namespace, pod)
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
sum by (namespace) (
sum by (namespace, pod) (
max by (namespace, pod, container) (
kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
) * on(namespace, pod) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
* on (namespace, pod)
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
sum by (namespace) (
sum by (namespace, pod) (
max by (namespace, pod, container) (
kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
) * on(namespace, pod) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
- expr: |
@@ -139,7 +147,7 @@ spec:
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
) by (cluster, namespace, workload, pod)
labels:
workload_type: deployment
record: mixin_pod_workload
@@ -149,7 +157,7 @@ spec:
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
) by (cluster, namespace, workload, pod)
labels:
workload_type: daemonset
record: mixin_pod_workload
@@ -159,7 +167,7 @@ spec:
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
) by (cluster, namespace, workload, pod)
labels:
workload_type: statefulset
record: mixin_pod_workload
@@ -212,13 +220,14 @@ spec:
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- name: node.rules
rules:
- expr: sum(min(kube_pod_info) by (node))
- expr: |
sum(min(kube_pod_info) by (cluster, node))
record: ':kube_pod_info_node_count:'
- expr: |
max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
count by (node) (sum by (node, cpu) (
count by (cluster, node) (sum by (node, cpu) (
node_cpu_seconds_total{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
@@ -233,7 +242,7 @@ spec:
node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"}
)
)
) by (cluster)
record: :node_memory_MemAvailable_bytes:sum
- name: kube-prometheus-node-recording.rules
rules:
@@ -701,9 +710,9 @@ spec:
}} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
expr: |
kubelet_volume_stats_available_bytes{job="kubelet"}
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
< 0.03
for: 1m
labels:
@@ -716,12 +725,12 @@ spec:
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
expr: |
(
kubelet_volume_stats_available_bytes{job="kubelet"}
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
) < 0.15
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
for: 1h
labels:
severity: critical
@@ -760,6 +769,156 @@ spec:
for: 15m
labels:
severity: warning
- name: kube-apiserver-error
rules:
- alert: ErrorBudgetBurn
annotations:
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
expr: |
(
status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000)
)
or
(
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000)
)
labels:
job: apiserver
severity: critical
- alert: ErrorBudgetBurn
annotations:
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
expr: |
(
status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000)
)
or
(
status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000)
)
labels:
job: apiserver
severity: warning
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[5m]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate5m
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[30m]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate30m
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[1h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate1h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[2h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate2h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[6h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate6h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[1d]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate1d
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[3d]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate3d
- expr: |
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate5m
- expr: |
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate30m
- expr: |
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate1h
- expr: |
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate2h
- expr: |
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate6h
- expr: |
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate1d
- expr: |
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate3d
- name: kubernetes-system-apiserver
rules:
- alert: KubeAPILatencyHigh
@@ -898,7 +1057,7 @@ spec:
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: |
max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
for: 15m
labels:
severity: warning
@@ -907,7 +1066,7 @@ spec:
message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
expr: |
absent(up{job="kubelet"} == 1)
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
labels:
severity: critical
@@ -1140,13 +1299,10 @@ spec:
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: |
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
{{ end }}
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
are out of sync.
expr: |
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})) != 1
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
for: 5m
labels:
severity: critical

View File

@@ -10,6 +10,38 @@ spec:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|request|server).*
sourceLabels:
@@ -22,6 +54,11 @@ spec:
regex: apiserver_admission_step_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
sourceLabels:
- __name__
- le
port: https
scheme: https
tlsConfig:

View File

@@ -9,6 +9,38 @@ spec:
endpoints:
- interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|request|server).*
sourceLabels:


@@ -10,6 +10,39 @@ spec:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
port: https-metrics
relabelings:
- sourceLabels:


@@ -17,6 +17,7 @@ package e2e
import (
	"log"
	"os"
	"strings"
	"testing"
	"time"
@@ -57,23 +58,22 @@ func testMain(m *testing.M) int {
}
func TestQueryPrometheus(t *testing.T) {
t.Parallel()
queries := []struct {
query string
expectN int
}{
{
// query: `up{job="node-exporter"} == 1`,
// expectN: 1,
// }, {
query: `up{job="node-exporter"} == 1`,
expectN: 1,
}, {
// query: `up{job="kubelet"} == 1`,
// expectN: 1,
// }, {
query: `up{job="apiserver"} == 1`,
expectN: 1,
// }, {
// query: `up{job="kube-state-metrics"} == 1`,
// expectN: 1,
}, {
query: `up{job="kube-state-metrics"} == 1`,
expectN: 1,
}, {
query: `up{job="prometheus-k8s"} == 1`,
expectN: 1,
@@ -116,3 +116,25 @@ func TestQueryPrometheus(t *testing.T) {
t.Fatal(err)
}
}
func TestDroppedMetrics(t *testing.T) {
	// query metadata for all metrics and their help text
	md, err := promClient.metadata("{job=~\".+\"}")
	if err != nil {
		log.Fatal(err)
	}

	for _, k := range md.Data {
		// check if the metric's help text contains Deprecated
		if strings.Contains(k.Help, "Deprecated") {
			// query prometheus for the Deprecated metric
			n, err := promClient.query(k.Metric)
			if err != nil {
				log.Fatal(err)
			}
			if n > 0 {
				t.Fatalf("deprecated metric with name: %s and help text: %s exists.", k.Metric, k.Help)
			}
		}
	}
}
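
TestDroppedMetrics complements the relabeling rules: every metric whose help text contains "Deprecated" must no longer return any samples. A rough standalone sketch of the same check, assuming a Prometheus API reachable at http://localhost:9090 (e.g. via kubectl port-forward), rather than the pod-proxy client the test uses:

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strings"
)

// promURL is an assumption for this sketch; the e2e test reaches the same API
// through the Kubernetes API server's pod proxy instead.
const promURL = "http://localhost:9090"

type targetMetadata struct {
	Status string `json:"status"`
	Data   []struct {
		Metric string `json:"metric"`
		Help   string `json:"help"`
	} `json:"data"`
}

type queryResult struct {
	Status string `json:"status"`
	Data   struct {
		Result []json.RawMessage `json:"result"`
	} `json:"data"`
}

func getJSON(rawURL string, v interface{}) error {
	resp, err := http.Get(rawURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	return json.NewDecoder(resp.Body).Decode(v)
}

func main() {
	// Fetch the help text of every metric exposed by any scraped target.
	var md targetMetadata
	if err := getJSON(promURL+"/api/v1/targets/metadata?match_target="+url.QueryEscape(`{job=~".+"}`), &md); err != nil {
		panic(err)
	}

	for _, m := range md.Data {
		if !strings.Contains(m.Help, "Deprecated") {
			continue
		}
		// A deprecated metric that still returns samples was not dropped.
		var qr queryResult
		if err := getJSON(promURL+"/api/v1/query?query="+url.QueryEscape(m.Metric), &qr); err != nil {
			panic(err)
		}
		if len(qr.Data.Result) > 0 {
			fmt.Printf("deprecated metric still scraped: %s (%s)\n", m.Metric, m.Help)
		}
	}
}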


@@ -15,6 +15,10 @@
package e2e
import (
	"bytes"
	"encoding/json"
	"fmt"
	"k8s.io/client-go/kubernetes"
	"github.com/Jeffail/gabs"
@@ -50,3 +54,41 @@ func (c *prometheusClient) query(query string) (int, error) {
	n, err := res.ArrayCountP("data.result")
	return n, err
}
type Metadata struct {
	Status string `json:"status,omitempty"`
	Data   []Data `json:"data,omitempty"`
}

type Data struct {
	Metric string `json:"metric,omitempty"`
	Help   string `json:"help,omitempty"`
}

// metadata makes a request against the Prometheus /api/v1/targets/metadata endpoint.
// It returns all scraped metrics together with their metadata.
func (c *prometheusClient) metadata(query string) (Metadata, error) {
	req := c.kubeClient.CoreV1().RESTClient().Get().
		Namespace("monitoring").
		Resource("pods").
		SubResource("proxy").
		Name("prometheus-k8s-0:9090").
		Suffix("/api/v1/targets/metadata").Param("match_target", query)

	var data Metadata

	b, err := req.DoRaw()
	if err != nil {
		return data, err
	}

	r := bytes.NewReader(b)
	decoder := json.NewDecoder(r)

	err = decoder.Decode(&data)
	if err != nil {
		return data, err
	}

	if data.Status != "success" {
		return data, fmt.Errorf("status of returned response was not successful; status: %s", data.Status)
	}

	return data, err
}
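
The Metadata and Data structs above only pick out the fields the test needs; a real /api/v1/targets/metadata response also carries per-target labels plus a type and unit per entry, which the decoder simply ignores. A small sketch decoding an abridged, hand-written payload (the sample metric and help text are illustrative):

package main

import (
	"encoding/json"
	"fmt"
)

type Metadata struct {
	Status string `json:"status,omitempty"`
	Data   []Data `json:"data,omitempty"`
}

type Data struct {
	Metric string `json:"metric,omitempty"`
	Help   string `json:"help,omitempty"`
}

func main() {
	// Abridged payload in the shape returned by /api/v1/targets/metadata.
	payload := []byte(`{
		"status": "success",
		"data": [
			{
				"target": {"instance": "10.0.0.1:10250", "job": "kubelet"},
				"metric": "kubelet_pod_worker_latency_microseconds",
				"type": "summary",
				"help": "(Deprecated) Latency in microseconds to sync a single pod.",
				"unit": ""
			}
		]
	}`)

	var md Metadata
	if err := json.Unmarshal(payload, &md); err != nil {
		panic(err)
	}
	fmt.Printf("%s -> %s\n", md.Data[0].Metric, md.Data[0].Help)
}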


@@ -33,7 +33,7 @@ run_e2e_tests() {
./kind delete cluster
}
cluster_compatible_versions=("v1.14.1" "v1.15.0" "v1.16.1" "v1.17.0")
cluster_compatible_versions=("v1.16.1" "v1.17.0")
for cluster_version in "${cluster_compatible_versions[@]}"
do