Compare commits
89 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | ee8077db04 |  |
|  | d3bee7fa1a |  |
|  | 106132ac18 |  |
|  | 8961be9639 |  |
|  | ae589e91ce |  |
|  | 8367575768 |  |
|  | 6b5033d65e |  |
|  | 68d6e611c6 |  |
|  | f2b4528b63 |  |
|  | be8eb39024 |  |
|  | 629e86e53a |  |
|  | a7628e0223 |  |
|  | 8984606f5d |  |
|  | 48d95f0b9f |  |
|  | e410043b6b |  |
|  | 894069f24d |  |
|  | d074ea1427 |  |
|  | 269aef6e37 |  |
|  | 90e5982de4 |  |
|  | 7165938b39 |  |
|  | 9ebe632d5d |  |
|  | 72ae778bfc |  |
|  | 0608c96bf6 |  |
|  | 44f3c61010 |  |
|  | f517b35a42 |  |
|  | 54c0fda307 |  |
|  | 6a3d667d3e |  |
|  | d9d3139dc8 |  |
|  | 67ed0f63c2 |  |
|  | 7788d0d327 |  |
|  | fca505f2a2 |  |
|  | d40548d3a0 |  |
|  | dba42d3477 |  |
|  | ee37661c34 |  |
|  | 8b36950f0e |  |
|  | 932745172d |  |
|  | 1af59f3130 |  |
|  | 6562b02da8 |  |
|  | 23999e44df |  |
|  | 69d3357892 |  |
|  | 3465b0fa0d |  |
|  | 1d1ce4967f |  |
|  | 3a0e6ba91f |  |
|  | 81e2d19398 |  |
|  | 92d4cbae08 |  |
|  | 2e72a8a832 |  |
|  | 9493a1a5f7 |  |
|  | 0a48577bb7 |  |
|  | 9211c42df0 |  |
|  | 5cddfd8da7 |  |
|  | bd69007c8c |  |
|  | 4f2b9c1ec8 |  |
|  | 0be63d47fc |  |
|  | 5fe60f37a2 |  |
|  | 200fee8d7c |  |
|  | 1b9be6d00b |  |
|  | ce68c4b392 |  |
|  | 5e9b883528 |  |
|  | 69b0ba03f1 |  |
|  | 3279f222a0 |  |
|  | 543ccec970 |  |
|  | f17ddfd293 |  |
|  | 3b8530d742 |  |
|  | 44fe363211 |  |
|  | 326453cf47 |  |
|  | 159a14ef47 |  |
|  | d03d57e6bb |  |
|  | 31cb71fcd9 |  |
|  | 4474b24a32 |  |
|  | 339ade5a81 |  |
|  | ce7c5fa3b4 |  |
|  | 3f388b797d |  |
|  | 20abdf3b72 |  |
|  | cd0f3c641e |  |
|  | 408fde189b |  |
|  | 90cf0ae21c |  |
|  | 3ba4b5602a |  |
|  | cb0e6e2c89 |  |
|  | 03f7adcf92 |  |
|  | fd267aebeb |  |
|  | 420425d88e |  |
|  | 965bec0ad7 |  |
|  | d22bad8293 |  |
|  | 8c255e9e6c |  |
|  | 56027ac757 |  |
|  | 50b06b0d33 |  |
|  | 6f6fd65a48 |  |
|  | f48fe057dc |  |
|  | 8487871388 |  |
Makefile (2 changed lines)
@@ -15,7 +15,7 @@ CONTAINER_CMD:=docker run --rm \
 	-v "$(shell go env GOCACHE):/.cache/go-build" \
 	-v "$(PWD):/go/src/github.com/coreos/kube-prometheus:Z" \
 	-w "/go/src/github.com/coreos/kube-prometheus" \
-	quay.io/coreos/jsonnet-ci
+	quay.io/coreos/jsonnet-ci:release-0.36
 
 all: generate fmt test
@@ -236,8 +236,13 @@ The previous steps (compilation) has created a bunch of manifest files in the ma
 Now simply use `kubectl` to install Prometheus and Grafana as per your configuration:
 
 ```shell
+# Update the namespace and CRDs, and then wait for them to be available before creating the remaining resources
+$ kubectl apply -f manifests/setup
 $ kubectl apply -f manifests/
 ```
 
+Alternatively, the resources in both folders can be applied with a single command
+`kubectl apply -Rf manifests`, but it may be necessary to run the command multiple times for all components to
+be created successfully.
+
 Check the monitoring namespace (or the namespace you have specified in `namespace: `) and make sure the pods are running. Prometheus and Grafana should be up and running soon.
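The race that motivates the two-step apply is that the CRDs registered by `manifests/setup` take a moment to become established. A minimal sketch of scripting around it — the `servicemonitors` probe is an illustrative choice, not something this diff prescribes:

```shell
# Create the namespace and CRDs first
kubectl apply -f manifests/setup

# Poll until the ServiceMonitor CRD is actually served by the API server,
# then create everything else
until kubectl get servicemonitors --all-namespaces; do
    date
    sleep 1
done
kubectl apply -f manifests/
```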
@@ -14,12 +14,16 @@ spec:
     port: 8080
     targetPort: web
 ---
-apiVersion: extensions/v1beta1
+apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: example-app
   namespace: default
 spec:
+  selector:
+    matchLabels:
+      app: example-app
+      version: 1.1.3
   replicas: 4
   template:
     metadata:
@@ -14,6 +14,14 @@ rules:
   - get
   - list
   - watch
+- apiGroups:
+  - "apps"
+  resources:
+  - deployments
+  verbs:
+  - get
+  - list
+  - watch
 - apiGroups:
   - "extensions"
   resources:
@@ -1,4 +1,4 @@
-apiVersion: extensions/v1beta1
+apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: metrics-server
@@ -19,7 +19,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
         resolve_timeout: '5m',
       },
       route: {
-        group_by: ['job'],
+        group_by: ['namespace'],
        group_wait: '30s',
         group_interval: '5m',
         repeat_interval: '12h',
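Downstream users do not need to patch this file to tune the route further; a minimal sketch, assuming the config object shown above is exposed as `$._config.alertmanager.config` (as in kube-prometheus's alertmanager libsonnet):

```jsonnet
// example.jsonnet -- a sketch, not part of this diff
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    alertmanager+:: {
      config+: {
        route+: {
          // group by job as well as the new namespace default
          group_by: ['namespace', 'job'],
        },
      },
    },
  },
};

// render only the regenerated Alertmanager secret
kp.alertmanager.secret
```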
@@ -0,0 +1,50 @@
+[
+  // Drop all kubelet metrics which are deprecated in kubernetes.
+  {
+    sourceLabels: ['__name__'],
+    regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)',
+    action: 'drop',
+  },
+  // Drop all scheduler metrics which are deprecated in kubernetes.
+  {
+    sourceLabels: ['__name__'],
+    regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)',
+    action: 'drop',
+  },
+  // Drop all apiserver metrics which are deprecated in kubernetes.
+  {
+    sourceLabels: ['__name__'],
+    regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)',
+    action: 'drop',
+  },
+  // Drop all docker metrics which are deprecated in kubernetes.
+  {
+    sourceLabels: ['__name__'],
+    regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)',
+    action: 'drop',
+  },
+  // Drop all reflector metrics which are deprecated in kubernetes.
+  {
+    sourceLabels: ['__name__'],
+    regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)',
+    action: 'drop',
+  },
+  // Drop all etcd metrics which are deprecated in kubernetes.
+  {
+    sourceLabels: ['__name__'],
+    regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)',
+    action: 'drop',
+  },
+  // Drop all transformation metrics which are deprecated in kubernetes.
+  {
+    sourceLabels: ['__name__'],
+    regex: 'transformation_(transformation_latencies_microseconds|failures_total)',
+    action: 'drop',
+  },
+  // Drop all other metrics which are deprecated in kubernetes.
+  {
+    sourceLabels: ['__name__'],
+    regex: '(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)',
+    action: 'drop',
+  },
+]
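Since the new file evaluates to a plain jsonnet array, extending the shipped drop list is just concatenation; a minimal sketch (the `my_app_debug_.*` regex is illustrative only):

```jsonnet
// a sketch, not part of this diff
local deprecated = import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet';

deprecated + [
  {
    sourceLabels: ['__name__'],
    regex: 'my_app_debug_.*',  // hypothetical extra metrics to drop
    action: 'drop',
  },
]
```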
@@ -0,0 +1,20 @@
+local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
+
+{
+  prometheus+:: {
+    clusterRole+: {
+      rules+:
+        local role = k.rbac.v1.role;
+        local policyRule = role.rulesType;
+        local rule = policyRule.new() +
+                     policyRule.withApiGroups(['']) +
+                     policyRule.withResources([
+                       'services',
+                       'endpoints',
+                       'pods',
+                     ]) +
+                     policyRule.withVerbs(['get', 'list', 'watch']);
+        [rule]
+    },
+  }
+}
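The snippet above narrows Prometheus's ClusterRole to the three core resources it actually scrapes. A sketch of mixing it in downstream — the second import path is hypothetical, since the file's final name is not visible in this diff:

```jsonnet
// a sketch, not part of this diff; the second import path is hypothetical
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
           (import 'kube-prometheus/prometheus-restricted-rbac.libsonnet');

{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) }
```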
@@ -4,6 +4,17 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
 
 {
   prometheus+: {
+    serviceMonitorCoreDNS+: {
+      spec+: {
+        endpoints: [
+          {
+            bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
+            interval: "15s",
+            targetPort: 9153
+          }
+        ]
+      },
+    },
     AwsEksCniMetricService:
       service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
       service.mixin.metadata.withNamespace('kube-system') +
@@ -9,6 +9,9 @@
     'kube-rbac-proxy'+: {
       limits: {},
     },
+    'kube-state-metrics'+: {
+      limits: {},
+    },
     'node-exporter'+: {
       limits: {},
     },
@@ -78,8 +78,8 @@ local configMapList = k3.core.v1.configMapList;
       // 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2
     ],
 
-    cadvisorSelector: 'job="kubelet"',
-    kubeletSelector: 'job="kubelet"',
+    cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
+    kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
     kubeStateMetricsSelector: 'job="kube-state-metrics"',
     nodeExporterSelector: 'job="node-exporter"',
     notKubeDnsSelector: 'job!="kube-dns"',
@@ -116,6 +116,10 @@ local configMapList = k3.core.v1.configMapList;
       requests: { cpu: '10m', memory: '20Mi' },
       limits: { cpu: '20m', memory: '40Mi' },
     },
+    'kube-state-metrics': {
+      requests: { cpu: '100m', memory: '150Mi' },
+      limits: { cpu: '100m', memory: '150Mi' },
+    },
     'node-exporter': {
       requests: { cpu: '102m', memory: '180Mi' },
       limits: { cpu: '250m', memory: '180Mi' },
@@ -8,13 +8,10 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
     collectors: '',  // empty string gets a default set
     scrapeInterval: '30s',
     scrapeTimeout: '30s',
-
-    baseCPU: '100m',
-    baseMemory: '150Mi',
   },
 
   versions+:: {
-    kubeStateMetrics: 'v1.8.0',
+    kubeStateMetrics: 'v1.9.2',
     kubeRbacProxy: 'v0.4.1',
   },
@@ -125,6 +122,22 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
         rulesType.withApiGroups(['storage.k8s.io']) +
         rulesType.withResources([
           'storageclasses',
+          'volumeattachments',
         ]) +
         rulesType.withVerbs(['list', 'watch']),
 
+        rulesType.new() +
+        rulesType.withApiGroups(['admissionregistration.k8s.io']) +
+        rulesType.withResources([
+          'validatingwebhookconfigurations',
+          'mutatingwebhookconfigurations',
+        ]) +
+        rulesType.withVerbs(['list', 'watch']),
+
+        rulesType.new() +
+        rulesType.withApiGroups(['networking.k8s.io']) +
+        rulesType.withResources([
+          'networkpolicies',
+        ]) +
+        rulesType.withVerbs(['list', 'watch']),
       ];
@@ -174,8 +187,8 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
           '--telemetry-host=127.0.0.1',
           '--telemetry-port=8082',
         ] + if $._config.kubeStateMetrics.collectors != '' then ['--collectors=' + $._config.kubeStateMetrics.collectors] else []) +
-        container.mixin.resources.withRequests({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory }) +
-        container.mixin.resources.withLimits({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory });
+        container.mixin.resources.withRequests($._config.resources['kube-state-metrics'].requests) +
+        container.mixin.resources.withLimits($._config.resources['kube-state-metrics'].limits);
 
       local c = [proxyClusterMetrics, proxySelfMetrics, kubeStateMetrics];
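With the baseCPU/baseMemory knobs removed, kube-state-metrics sizing now flows through the shared `$._config.resources` map introduced earlier in this compare; a minimal sketch of overriding it downstream:

```jsonnet
// a sketch, not part of this diff; the request/limit values are illustrative
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    resources+:: {
      'kube-state-metrics': {
        requests: { cpu: '200m', memory: '300Mi' },
        limits: { cpu: '200m', memory: '300Mi' },
      },
    },
  },
};

// render just the resized Deployment
kp.kubeStateMetrics.deployment
```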
@@ -89,7 +89,8 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
           '--path.procfs=/host/proc',
           '--path.sysfs=/host/sys',
           '--path.rootfs=/host/root',
-
+          '--no-collector.wifi',
+          '--no-collector.hwmon',
           // The following settings have been taken from
           // https://github.com/prometheus/node_exporter/blob/0662673/collector/filesystem_linux.go#L30-L31
           // Once node exporter is being released with those settings, this can be removed.
@@ -105,7 +106,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
         container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) +
         container.withArgs([
           '--logtostderr',
-          '--secure-listen-address=$(IP):' + $._config.nodeExporter.port,
+          '--secure-listen-address=[$(IP)]:' + $._config.nodeExporter.port,
           '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites),
           '--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/',
         ]) +
@@ -168,7 +169,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
         {
           port: 'https',
           scheme: 'https',
-          interval: '30s',
+          interval: '15s',
           bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
           relabelings: [
             {
@@ -191,7 +191,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
     local rules =
       policyRule.new() +
       policyRule.withApiGroups(['metrics.k8s.io']) +
-      policyRule.withResources(['pods']) +
+      policyRule.withResources(['pods', 'nodes']) +
       policyRule.withVerbs(['get','list','watch']);
 
     clusterRole.new() +
@@ -160,6 +160,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
     local resourceRequirements = container.mixin.resourcesType;
     local selector = statefulSet.mixin.spec.selectorType;
 
+
     local resources =
       resourceRequirements.new() +
       resourceRequirements.withRequests({ memory: '400Mi' });
@@ -182,6 +183,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
           serviceMonitorSelector: {},
           podMonitorSelector: {},
           serviceMonitorNamespaceSelector: {},
+          podMonitorNamespaceSelector: {},
           nodeSelector: { 'kubernetes.io/os': 'linux' },
           ruleSelector: selector.withMatchLabels({
             role: 'alert-rules',
@@ -283,10 +285,11 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
                 insecureSkipVerify: true,
               },
               bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
+              metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet'),
               relabelings: [
                 {
                   sourceLabels: ['__metrics_path__'],
-                  targetLabel: 'metrics_path'
+                  targetLabel: 'metrics_path',
                 },
               ],
             },
@@ -303,7 +306,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
               relabelings: [
                 {
                   sourceLabels: ['__metrics_path__'],
-                  targetLabel: 'metrics_path'
+                  targetLabel: 'metrics_path',
                 },
               ],
               metricRelabelings: [
@@ -346,7 +349,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
             {
               port: 'http-metrics',
               interval: '30s',
-              metricRelabelings: [
+              metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
                 {
                   sourceLabels: ['__name__'],
                   regex: 'etcd_(debugging|disk|request|server).*',
@@ -401,7 +404,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
                 serverName: 'kubernetes',
               },
               bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
-              metricRelabelings: [
+              metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
                 {
                   sourceLabels: ['__name__'],
                   regex: 'etcd_(debugging|disk|request|server).*',
@@ -417,6 +420,11 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
                   regex: 'apiserver_admission_step_admission_latencies_seconds_.*',
                   action: 'drop',
                 },
+                {
+                  sourceLabels: ['__name__', 'le'],
+                  regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)',
+                  action: 'drop',
+                },
               ],
             },
           ],
jsonnet/kube-prometheus/rules/general.libsonnet (new file, 19 lines)
@@ -0,0 +1,19 @@
+{
+  prometheusRules+:: {
+    groups+: [
+      {
+        name: 'kube-prometheus-general.rules',
+        rules: [
+          {
+            expr: 'count without(instance, pod, node) (up == 1)',
+            record: 'count:up1',
+          },
+          {
+            expr: 'count without(instance, pod, node) (up == 0)',
+            record: 'count:up0',
+          },
+        ],
+      },
+    ],
+  },
+}
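Because `count:up1`/`count:up0` aggregate away instance-level labels, they stay cheap to query even on large fleets; a sketch of an alert built on top of them (the alert itself is illustrative, not part of this diff):

```jsonnet
// a sketch, not part of this diff
{
  prometheusRules+:: {
    groups+: [
      {
        name: 'example-up.rules',  // hypothetical group name
        rules: [
          {
            alert: 'ManyScrapeTargetsDown',
            // fire when a job has more down targets than up ones
            expr: 'count:up0 > count:up1',
            'for': '15m',
            labels: { severity: 'warning' },
          },
        ],
      },
    ],
  },
}
```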
@@ -1 +1,2 @@
-(import 'node-rules.libsonnet')
+(import 'node-rules.libsonnet') +
+(import 'general.libsonnet')
@@ -8,8 +8,8 @@
         "subdir": "Documentation/etcd-mixin"
       }
     },
-    "version": "cbc1340af53f50728181f97f6bce442ac33d8993",
-    "sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw="
+    "version": "5770a6d286fe48682e29b54ce0df37e7d24b3280",
+    "sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0="
   },
   {
     "name": "grafana",
@@ -30,7 +30,7 @@
         "subdir": "grafana-builder"
       }
     },
-    "version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd",
+    "version": "676ff4b4fe9135f85a5d6e30523d64d2d3713087",
     "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE="
   },
   {
@@ -41,8 +41,8 @@
         "subdir": "grafonnet"
       }
     },
-    "version": "b82411476842f583817e67feff5becf1228fd540",
-    "sum": "mEosZ6hZCTCw8AaASEtRFjY8PSmpvqI3xj6IWpwcroU="
+    "version": "f3ee1d810858cf556d25f045b53cb0f1fd10b94e",
+    "sum": "14YBZUP/cl8qi9u86xiuUS4eXQrEAam+4GSg6i9n9Ys="
   },
   {
     "name": "ksonnet",
@@ -72,8 +72,8 @@
         "subdir": ""
       }
     },
-    "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5",
-    "sum": "qfm0EpLrEZ1+fe93LFLa9tyOalK6JehpholxO2d0xXU="
+    "version": "68f82d2a428d91df57e9af43739981a6a8ede897",
+    "sum": "J/tuXi0Z8GRHo63pM17YFIyk4QgkFuMcQ20mAxi1flM="
   },
   {
     "name": "node-mixin",
@@ -83,7 +83,7 @@
         "subdir": "docs/node-mixin"
       }
     },
-    "version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5",
+    "version": "2cae917bb7e0b6379221e8a24da012b16e63d661",
     "sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg="
   },
   {
@@ -94,8 +94,8 @@
         "subdir": "documentation/prometheus-mixin"
       }
     },
-    "version": "431844f0a7c289e4255a68f09a18fcca09637fb2",
-    "sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM="
+    "version": "31700a05df64c2b4e32bb0ecd8baa25279144778",
+    "sum": "/cohvDTaIiLElG66tKeQsi4v1M9mlGDKjOBSWivL9TU="
   },
   {
     "name": "prometheus-operator",
@@ -116,8 +116,19 @@
         "subdir": "lib/promgrafonnet"
       }
     },
-    "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5",
+    "version": "a7ee9d1abe1b1a3670a02ede1135cadb660b9d0c",
     "sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
-  }
+  },
+  {
+    "name": "slo-libsonnet",
+    "source": {
+      "git": {
+        "remote": "https://github.com/metalmatze/slo-libsonnet",
+        "subdir": "slo-libsonnet"
+      }
+    },
+    "version": "437c402c5f3ad86c3c16db8471f1649284fef0ee",
+    "sum": "2Zcyku1f558VrUpMaJnI78fahDksPLcS1idmxxwcQ7Q="
+  }
 ]
}
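The version/sum churn above is the output of a dependency refresh rather than hand edits; with jsonnet-bundler installed it is typically regenerated with:

```shell
# refresh all pinned dependencies and rewrite jsonnetfile.lock.json
jb update
```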
@@ -1,6 +1,6 @@
 apiVersion: v1
 data:
-  alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gImpvYiIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
+  alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gIm5hbWVzcGFjZSIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
 kind: Secret
 metadata:
   name: alertmanager-main
(File diff suppressed because it is too large)
@@ -86,6 +86,22 @@ rules:
   - storage.k8s.io
   resources:
   - storageclasses
+  - volumeattachments
   verbs:
   - list
   - watch
+- apiGroups:
+  - admissionregistration.k8s.io
+  resources:
+  - validatingwebhookconfigurations
+  - mutatingwebhookconfigurations
+  verbs:
+  - list
+  - watch
+- apiGroups:
+  - networking.k8s.io
+  resources:
+  - networkpolicies
+  verbs:
+  - list
+  - watch
@@ -55,7 +55,7 @@ spec:
         - --port=8081
         - --telemetry-host=127.0.0.1
         - --telemetry-port=8082
-        image: quay.io/coreos/kube-state-metrics:v1.8.0
+        image: quay.io/coreos/kube-state-metrics:v1.9.2
         name: kube-state-metrics
         resources:
           limits:
@@ -20,6 +20,8 @@ spec:
         - --path.procfs=/host/proc
         - --path.sysfs=/host/sys
         - --path.rootfs=/host/root
+        - --no-collector.wifi
+        - --no-collector.hwmon
        - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
         - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
         image: quay.io/prometheus/node-exporter:v0.18.1
@@ -44,7 +46,7 @@ spec:
           readOnly: true
       - args:
         - --logtostderr
-        - --secure-listen-address=$(IP):9100
+        - --secure-listen-address=[$(IP)]:9100
         - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
         - --upstream=http://127.0.0.1:9100/
         env:
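The bracketed `[$(IP)]:9100` form matters once `$(IP)` can expand to an IPv6 pod address: host:port syntax requires IPv6 literals to be bracketed, while IPv4 addresses parse either way.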
@@ -8,7 +8,7 @@ metadata:
 spec:
   endpoints:
   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
-    interval: 30s
+    interval: 15s
     port: https
     relabelings:
     - action: replace
@@ -11,6 +11,7 @@ rules:
   - metrics.k8s.io
   resources:
   - pods
+  - nodes
   verbs:
   - get
   - list
@@ -14,6 +14,7 @@ spec:
   baseImage: quay.io/prometheus/prometheus
   nodeSelector:
     kubernetes.io/os: linux
+  podMonitorNamespaceSelector: {}
   podMonitorSelector: {}
   replicas: 2
   resources:
@@ -68,61 +68,74 @@ spec:
   - name: kube-apiserver.rules
     rules:
     - expr: |
-        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+        sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
+        /
+        sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
+      record: cluster:apiserver_request_duration_seconds:mean5m
+    - expr: |
+        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.99"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
     - expr: |
-        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.9"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
     - expr: |
-        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
+        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.5"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
   - name: k8s.rules
     rules:
     - expr: |
-        sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace)
+        sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace)
       record: namespace:container_cpu_usage_seconds_total:sum_rate
     - expr: |
-        sum by (namespace, pod, container) (
-          rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])
-        ) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
+        sum by (cluster, namespace, pod, container) (
+          rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
+        ) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info)
       record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
     - expr: |
-        container_memory_working_set_bytes{job="kubelet", image!=""}
+        container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
         * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
       record: node_namespace_pod_container:container_memory_working_set_bytes
     - expr: |
-        container_memory_rss{job="kubelet", image!=""}
+        container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
         * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
       record: node_namespace_pod_container:container_memory_rss
     - expr: |
-        container_memory_cache{job="kubelet", image!=""}
+        container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
        * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
       record: node_namespace_pod_container:container_memory_cache
     - expr: |
-        container_memory_swap{job="kubelet", image!=""}
+        container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
         * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
       record: node_namespace_pod_container:container_memory_swap
     - expr: |
-        sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
+        sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
       record: namespace:container_memory_usage_bytes:sum
     - expr: |
-        sum by (namespace, label_name) (
-          sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
-          * on (namespace, pod)
-          group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
-        )
+        sum by (namespace) (
+          sum by (namespace, pod) (
+            max by (namespace, pod, container) (
+              kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
+            ) * on(namespace, pod) group_left() max by (namespace, pod) (
+              kube_pod_status_phase{phase=~"Pending|Running"} == 1
+            )
+          )
+        )
       record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
     - expr: |
-        sum by (namespace, label_name) (
-          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
-          * on (namespace, pod)
-          group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
-        )
+        sum by (namespace) (
+          sum by (namespace, pod) (
+            max by (namespace, pod, container) (
+              kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
+            ) * on(namespace, pod) group_left() max by (namespace, pod) (
+              kube_pod_status_phase{phase=~"Pending|Running"} == 1
+            )
+          )
+        )
       record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
     - expr: |
@@ -134,7 +147,7 @@ spec:
           ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
           "workload", "$1", "owner_name", "(.*)"
         )
-      ) by (namespace, workload, pod)
+      ) by (cluster, namespace, workload, pod)
       labels:
         workload_type: deployment
       record: mixin_pod_workload
@@ -144,7 +157,7 @@ spec:
           kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
           "workload", "$1", "owner_name", "(.*)"
         )
-      ) by (namespace, workload, pod)
+      ) by (cluster, namespace, workload, pod)
       labels:
         workload_type: daemonset
       record: mixin_pod_workload
@@ -154,7 +167,7 @@ spec:
           kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
           "workload", "$1", "owner_name", "(.*)"
         )
-      ) by (namespace, workload, pod)
+      ) by (cluster, namespace, workload, pod)
       labels:
         workload_type: statefulset
       record: mixin_pod_workload
@@ -207,13 +220,14 @@ spec:
       record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
   - name: node.rules
     rules:
-    - expr: sum(min(kube_pod_info) by (node))
+    - expr: |
+        sum(min(kube_pod_info) by (cluster, node))
       record: ':kube_pod_info_node_count:'
     - expr: |
         max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
       record: 'node_namespace_pod:kube_pod_info:'
     - expr: |
-        count by (node) (sum by (node, cpu) (
+        count by (cluster, node) (sum by (node, cpu) (
           node_cpu_seconds_total{job="node-exporter"}
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
@@ -228,7 +242,7 @@ spec:
           node_memory_MemFree_bytes{job="node-exporter"} +
           node_memory_Slab_bytes{job="node-exporter"}
         )
-      )
+      ) by (cluster)
       record: :node_memory_MemAvailable_bytes:sum
   - name: kube-prometheus-node-recording.rules
     rules:
@@ -251,6 +265,12 @@ spec:
     - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
         BY (instance, cpu))
       record: cluster:node_cpu:ratio
+  - name: kube-prometheus-general.rules
+    rules:
+    - expr: count without(instance, pod, node) (up == 1)
+      record: count:up1
+    - expr: count without(instance, pod, node) (up == 0)
+      record: count:up0
   - name: node-exporter
     rules:
     - alert: NodeFilesystemSpaceFillingUp
@@ -425,7 +445,7 @@ spec:
           state for longer than 15 minutes.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
-        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
+        sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
       for: 15m
       labels:
         severity: critical
@@ -690,9 +710,9 @@ spec:
           }} free.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
       expr: |
-        kubelet_volume_stats_available_bytes{job="kubelet"}
+        kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
           /
-        kubelet_volume_stats_capacity_bytes{job="kubelet"}
+        kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
           < 0.03
       for: 1m
       labels:
@@ -705,12 +725,12 @@ spec:
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
       expr: |
         (
-          kubelet_volume_stats_available_bytes{job="kubelet"}
+          kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
           /
-          kubelet_volume_stats_capacity_bytes{job="kubelet"}
+          kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
         ) < 0.15
         and
-        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
+        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
       for: 1h
       labels:
         severity: critical
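The ErrorBudgetBurn thresholds introduced in the next hunk follow the multiwindow, multi-burn-rate pattern; the multipliers are consistent with a 99% availability SLO (the `0.010000` factor) over a 30-day window, where a burn rate of 14.4 sustained for one hour consumes 2% of the budget:

\[
\text{burn rate} = \frac{\text{budget fraction} \times \text{SLO window}}{\text{alert window}}:
\quad \frac{0.02 \cdot 720\,\mathrm{h}}{1\,\mathrm{h}} = 14.4,\;
\frac{0.05 \cdot 720\,\mathrm{h}}{6\,\mathrm{h}} = 6,\;
\frac{0.10 \cdot 720\,\mathrm{h}}{24\,\mathrm{h}} = 3,\;
\frac{0.10 \cdot 720\,\mathrm{h}}{72\,\mathrm{h}} = 1.
\]

Each alert pairs a long window with a short one, so it catches sustained burn quickly and stops firing soon after the burn ends.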
@@ -749,16 +769,180 @@ spec:
       for: 15m
       labels:
         severity: warning
+  - name: kube-apiserver-error
+    rules:
+    - alert: ErrorBudgetBurn
+      annotations:
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
+      expr: |
+        (
+          status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000)
+          and
+          status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000)
+        )
+        or
+        (
+          status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000)
+          and
+          status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000)
+        )
+      labels:
+        job: apiserver
+        severity: critical
+    - alert: ErrorBudgetBurn
+      annotations:
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
+      expr: |
+        (
+          status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000)
+          and
+          status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000)
+        )
+        or
+        (
+          status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000)
+          and
+          status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000)
+        )
+      labels:
+        job: apiserver
+        severity: warning
+    - expr: |
+        sum by (status_class) (
+          label_replace(
+            rate(apiserver_request_total{job="apiserver"}[5m]
+          ), "status_class", "${1}xx", "code", "([0-9])..")
+        )
+      labels:
+        job: apiserver
+      record: status_class:apiserver_request_total:rate5m
+    - expr: |
+        sum by (status_class) (
+          label_replace(
+            rate(apiserver_request_total{job="apiserver"}[30m]
+          ), "status_class", "${1}xx", "code", "([0-9])..")
+        )
+      labels:
+        job: apiserver
+      record: status_class:apiserver_request_total:rate30m
+    - expr: |
+        sum by (status_class) (
+          label_replace(
+            rate(apiserver_request_total{job="apiserver"}[1h]
+          ), "status_class", "${1}xx", "code", "([0-9])..")
+        )
+      labels:
+        job: apiserver
+      record: status_class:apiserver_request_total:rate1h
+    - expr: |
+        sum by (status_class) (
+          label_replace(
+            rate(apiserver_request_total{job="apiserver"}[2h]
+          ), "status_class", "${1}xx", "code", "([0-9])..")
+        )
+      labels:
+        job: apiserver
+      record: status_class:apiserver_request_total:rate2h
+    - expr: |
+        sum by (status_class) (
+          label_replace(
+            rate(apiserver_request_total{job="apiserver"}[6h]
+          ), "status_class", "${1}xx", "code", "([0-9])..")
+        )
+      labels:
+        job: apiserver
+      record: status_class:apiserver_request_total:rate6h
+    - expr: |
+        sum by (status_class) (
+          label_replace(
+            rate(apiserver_request_total{job="apiserver"}[1d]
+          ), "status_class", "${1}xx", "code", "([0-9])..")
+        )
+      labels:
+        job: apiserver
+      record: status_class:apiserver_request_total:rate1d
+    - expr: |
+        sum by (status_class) (
+          label_replace(
+            rate(apiserver_request_total{job="apiserver"}[3d]
+          ), "status_class", "${1}xx", "code", "([0-9])..")
+        )
+      labels:
+        job: apiserver
+      record: status_class:apiserver_request_total:rate3d
+    - expr: |
+        sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
+        /
+        sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
+      labels:
+        job: apiserver
+      record: status_class_5xx:apiserver_request_total:ratio_rate5m
+    - expr: |
+        sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
+        /
+        sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
+      labels:
+        job: apiserver
+      record: status_class_5xx:apiserver_request_total:ratio_rate30m
+    - expr: |
+        sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
+        /
+        sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
+      labels:
+        job: apiserver
+      record: status_class_5xx:apiserver_request_total:ratio_rate1h
+    - expr: |
+        sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
+        /
+        sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
+      labels:
+        job: apiserver
+      record: status_class_5xx:apiserver_request_total:ratio_rate2h
+    - expr: |
+        sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
+        /
+        sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
+      labels:
+        job: apiserver
+      record: status_class_5xx:apiserver_request_total:ratio_rate6h
+    - expr: |
+        sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
+        /
+        sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
+      labels:
+        job: apiserver
+      record: status_class_5xx:apiserver_request_total:ratio_rate1d
+    - expr: |
+        sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
+        /
+        sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
+      labels:
+        job: apiserver
+      record: status_class_5xx:apiserver_request_total:ratio_rate3d
   - name: kubernetes-system-apiserver
     rules:
     - alert: KubeAPILatencyHigh
       annotations:
-        message: The API server has a 99th percentile latency of {{ $value }} seconds
-          for {{ $labels.verb }} {{ $labels.resource }}.
+        message: The API server has an abnormal latency of {{ $value }} seconds for
+          {{ $labels.verb }} {{ $labels.resource }}.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
-        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1
-      for: 10m
+        (
+          cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
+          >
+          on (verb) group_left()
+          (
+            avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+            +
+            2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+          )
+        ) > on (verb) group_left()
+        1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+        and on (verb,resource)
+        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
+        >
+        1
+      for: 5m
       labels:
         severity: warning
     - alert: KubeAPILatencyHigh
@@ -767,7 +951,7 @@ spec:
           for {{ $labels.verb }} {{ $labels.resource }}.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
-        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4
+        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
       for: 10m
       labels:
         severity: critical
@@ -873,7 +1057,7 @@ spec:
           }} of its Pod capacity.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
-        max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
+        max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
       for: 15m
       labels:
         severity: warning
@@ -882,7 +1066,7 @@ spec:
         message: Kubelet has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
       expr: |
-        absent(up{job="kubelet"} == 1)
+        absent(up{job="kubelet", metrics_path="/metrics"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -10,6 +10,38 @@ spec:
   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
     interval: 30s
     metricRelabelings:
+    - action: drop
+      regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: transformation_(transformation_latencies_microseconds|failures_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
+      sourceLabels:
+      - __name__
     - action: drop
       regex: etcd_(debugging|disk|request|server).*
       sourceLabels:
@@ -22,6 +54,11 @@ spec:
       regex: apiserver_admission_step_admission_latencies_seconds_.*
       sourceLabels:
      - __name__
+    - action: drop
+      regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
+      sourceLabels:
+      - __name__
+      - le
     port: https
     scheme: https
     tlsConfig:
@@ -9,6 +9,38 @@ spec:
   endpoints:
   - interval: 30s
     metricRelabelings:
+    - action: drop
+      regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: transformation_(transformation_latencies_microseconds|failures_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
+      sourceLabels:
+      - __name__
     - action: drop
       regex: etcd_(debugging|disk|request|server).*
      sourceLabels:
@@ -10,6 +10,39 @@ spec:
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    metricRelabelings:
    - action: drop
      regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
      sourceLabels:
      - __name__
    - action: drop
      regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
      sourceLabels:
      - __name__
    - action: drop
      regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
      sourceLabels:
      - __name__
    - action: drop
      regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
      sourceLabels:
      - __name__
    - action: drop
      regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
      sourceLabels:
      - __name__
    - action: drop
      regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
      sourceLabels:
      - __name__
    - action: drop
      regex: transformation_(transformation_latencies_microseconds|failures_total)
      sourceLabels:
      - __name__
    - action: drop
      regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
      sourceLabels:
      - __name__
    port: https-metrics
    relabelings:
    - sourceLabels:
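Prometheus fully anchors relabeling regexes (each pattern is compiled as `^(?:<regex>)$`), so a `drop` rule like the ones above only fires when the concatenated `sourceLabels` values — here just `__name__` — match in full. A minimal Go sketch of that matching behavior; the sample metric names are chosen purely for illustration:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Anchor the pattern the same way Prometheus relabeling does.
	re := regexp.MustCompile(`^(?:kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout))$`)

	for _, name := range []string{
		"kubelet_docker_operations",       // full match: the series is dropped
		"kubelet_docker_operations_total", // no full match: the series is kept
	} {
		fmt.Printf("%-40s dropped=%v\n", name, re.MatchString(name))
	}
}
```

Note that `kubelet_docker_operations_total` survives even though `operations` appears in the alternation, because the trailing `_total` prevents a full match.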
@@ -17,6 +17,7 @@ package e2e
import (
	"log"
	"os"
	"strings"
	"testing"
	"time"
@@ -57,23 +58,22 @@ func testMain(m *testing.M) int {
}

func TestQueryPrometheus(t *testing.T) {
	t.Parallel()

	queries := []struct {
		query   string
		expectN int
	}{
		{
			// query: `up{job="node-exporter"} == 1`,
			// expectN: 1,
			// }, {
			query:   `up{job="node-exporter"} == 1`,
			expectN: 1,
		}, {
			// query: `up{job="kubelet"} == 1`,
			// expectN: 1,
			// }, {
			query:   `up{job="apiserver"} == 1`,
			expectN: 1,
			// }, {
			// query: `up{job="kube-state-metrics"} == 1`,
			// expectN: 1,
		}, {
			query:   `up{job="kube-state-metrics"} == 1`,
			expectN: 1,
		}, {
			query:   `up{job="prometheus-k8s"} == 1`,
			expectN: 1,
@@ -87,7 +87,7 @@ func TestQueryPrometheus(t *testing.T) {
	}

	// Wait for the pod to respond to queries at all. Then start verifying their results.
	err := wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) {
	err := wait.Poll(5*time.Second, 2*time.Minute, func() (bool, error) {
		_, err := promClient.query("up")
		return err == nil, nil
	})
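For context, `wait.Poll` from `k8s.io/apimachinery` retries its condition at the given interval until the condition returns true, returns an error, or the timeout elapses — doubling the timeout simply gives Prometheus more time to come up. A self-contained sketch, assuming that same package:

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	start := time.Now()
	// Returning (false, nil) means "not ready yet, poll again";
	// a non-nil error aborts the poll immediately.
	err := wait.Poll(5*time.Second, 2*time.Minute, func() (bool, error) {
		return time.Since(start) > 12*time.Second, nil
	})
	fmt.Println("poll finished:", err) // nil once the condition became true
}
```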
@@ -116,3 +116,25 @@ func TestQueryPrometheus(t *testing.T) {
		t.Fatal(err)
	}
}

func TestDroppedMetrics(t *testing.T) {
	// Query the targets metadata endpoint for every metric and its help text.
	md, err := promClient.metadata("{job=~\".+\"}")
	if err != nil {
		log.Fatal(err)
	}
	for _, k := range md.Data {
		// Check if the metric's help text marks it as deprecated.
		if strings.Contains(k.Help, "Deprecated") {
			// Query Prometheus for the deprecated metric; relabeling should have dropped it.
			n, err := promClient.query(k.Metric)
			if err != nil {
				log.Fatal(err)
			}
			if n > 0 {
				t.Fatalf("deprecated metric with name: %s and help text: %s exists.", k.Metric, k.Help)
			}
		}
	}
}
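The test works because upstream Kubernetes marks deprecated metrics by including "(Deprecated)" in their help text, so a plain substring check is enough. A small standalone sketch of that filter; the metadata entries below are invented for illustration only:

```go
package main

import (
	"fmt"
	"strings"
)

// entry mirrors the Metric/Help pair returned by the metadata helper
// introduced in the following hunk.
type entry struct {
	Metric string
	Help   string
}

func main() {
	samples := []entry{
		{"kubelet_running_pod_count", "Number of pods currently running."},
		{"kubelet_pod_worker_latency_microseconds", "(Deprecated) Latency in microseconds to sync a single pod."},
	}
	for _, e := range samples {
		if strings.Contains(e.Help, "Deprecated") {
			// In the real test this triggers a query, and any result fails the run.
			fmt.Printf("deprecated metric still advertised: %s\n", e.Metric)
		}
	}
}
```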
@@ -15,6 +15,10 @@
package e2e

import (
	"bytes"
	"encoding/json"
	"fmt"

	"k8s.io/client-go/kubernetes"

	"github.com/Jeffail/gabs"
@@ -50,3 +54,41 @@ func (c *prometheusClient) query(query string) (int, error) {
	n, err := res.ArrayCountP("data.result")
	return n, err
}

type Metadata struct {
	Status string `json:"status,omitempty"`
	Data   []Data `json:"data,omitempty"`
}

type Data struct {
	Metric string `json:"metric,omitempty"`
	Help   string `json:"help,omitempty"`
}

// metadata makes a request against the Prometheus /api/v1/targets/metadata endpoint.
// It returns all the metrics and their metadata.
func (c *prometheusClient) metadata(query string) (Metadata, error) {
	req := c.kubeClient.CoreV1().RESTClient().Get().
		Namespace("monitoring").
		Resource("pods").
		SubResource("proxy").
		Name("prometheus-k8s-0:9090").
		Suffix("/api/v1/targets/metadata").Param("match_target", query)

	var data Metadata
	b, err := req.DoRaw()
	if err != nil {
		return data, err
	}

	r := bytes.NewReader(b)
	decoder := json.NewDecoder(r)
	err = decoder.Decode(&data)
	if err != nil {
		return data, err
	}
	if data.Status != "success" {
		return data, fmt.Errorf("status of returned response was not successful; status: %s", data.Status)
	}
	return data, err
}
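The `/api/v1/targets/metadata` endpoint returns one entry per metric per target; the structs above deliberately decode only the `metric` and `help` fields and ignore the rest. A sketch of that decoding against a hand-written approximation of the response (not captured output):

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// Metadata and Data mirror the structs used by the e2e helper above.
type Metadata struct {
	Status string `json:"status,omitempty"`
	Data   []Data `json:"data,omitempty"`
}

type Data struct {
	Metric string `json:"metric,omitempty"`
	Help   string `json:"help,omitempty"`
}

func main() {
	payload := []byte(`{
		"status": "success",
		"data": [
			{
				"target": {"instance": "10.0.0.1:10250", "job": "kubelet"},
				"metric": "kubelet_running_pod_count",
				"type": "gauge",
				"help": "Number of pods currently running.",
				"unit": ""
			}
		]
	}`)

	var md Metadata
	if err := json.Unmarshal(payload, &md); err != nil {
		log.Fatal(err)
	}
	// Fields absent from the structs (target, type, unit) are silently ignored.
	fmt.Println(md.Status, md.Data[0].Metric, md.Data[0].Help)
}
```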
@@ -10,19 +10,33 @@ set -x
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
chmod +x kubectl
curl -Lo kind https://github.com/kubernetes-sigs/kind/releases/download/v0.4.0/kind-linux-amd64
curl -Lo kind https://github.com/kubernetes-sigs/kind/releases/download/v0.6.1/kind-linux-amd64
chmod +x kind

./kind create cluster
export KUBECONFIG="$(./kind get kubeconfig-path)"
run_e2e_tests() {
  cluster_version=$1

# create namespace, permissions, and CRDs
./kubectl create -f manifests/setup
  ./kind create cluster --image=kindest/node:$cluster_version
  export KUBECONFIG="$(./kind get kubeconfig-path)"

# wait for CRD creation to complete
until ./kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done
  # create namespace, permissions, and CRDs
  ./kubectl create -f manifests/setup

# create monitoring components
./kubectl create -f manifests/
  # wait for CRD creation to complete
  until ./kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done

  # create monitoring components
  ./kubectl create -f manifests/

  make test-e2e

  ./kind delete cluster
}

cluster_compatible_versions=("v1.16.1" "v1.17.0")

for cluster_version in "${cluster_compatible_versions[@]}"
do
  run_e2e_tests $cluster_version
done

make test-e2e