Compare commits

24 Commits

v0.4.0 ... release-0.

- a95e9dada4
- 6fa8bfae2e
- 53f18a4276
- 058439f108
- e6529d950f
- ddd3fd7fe5
- ca9bf08a8a
- 4dd632afc0
- e964410209
- 326eecf9af
- a98d4be60e
- 3101c620c0
- f3a47b9bba
- 059149d37c
- 1ade732468
- f62ba1e136
- ed71719c8e
- 8fd8248928
- d1b81cc1ac
- 6a19c05248
- 989c6813aa
- 5e5d1297f4
- 7a94c41e08
- 6b3cb71ab2
@@ -236,13 +236,8 @@ The previous steps (compilation) have created a bunch of manifest files in the ma
Now simply use `kubectl` to install Prometheus and Grafana as per your configuration:

```shell
# Update the namespace and CRDs, and then wait for them to be available before creating the remaining resources
$ kubectl apply -f manifests/setup
$ kubectl apply -f manifests/
```

Alternatively, the resources in both folders can be applied with a single command
`kubectl apply -Rf manifests`, but it may be necessary to run the command multiple times for all components to
be created successfully.

Check the monitoring namespace (or the namespace you have specified in `namespace: `) and make sure the pods are running. Prometheus and Grafana should be up and running soon.
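The `namespace:` mentioned above is not edited in the generated manifests by hand; it comes from the jsonnet build configuration that the rest of this compare modifies. A minimal sketch of that override (the file name and import path are assumptions based on the kube-prometheus layout of this era; only `_config.namespace` is taken from the diffs below):

```jsonnet
// example.jsonnet (hypothetical): set the namespace the manifests target,
// then render one output object per generated Prometheus manifest.
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    namespace: 'monitoring',  // the namespace checked in the step above
  },
};

{ [name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) }
```

Compiling this (for example with `jsonnet -J vendor example.jsonnet`) regenerates the manifests that the `kubectl apply` steps consume.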
@@ -14,16 +14,12 @@ spec:
port: 8080
targetPort: web
---
apiVersion: apps/v1
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: example-app
namespace: default
spec:
selector:
matchLabels:
app: example-app
version: 1.1.3
replicas: 4
template:
metadata:

@@ -14,14 +14,6 @@ rules:
- get
- list
- watch
- apiGroups:
- "apps"
resources:
- deployments
verbs:
- get
- list
- watch
- apiGroups:
- "extensions"
resources:

@@ -1,4 +1,4 @@
apiVersion: apps/v1
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: metrics-server

@@ -19,7 +19,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
resolve_timeout: '5m',
},
route: {
group_by: ['namespace'],
group_by: ['job'],
group_wait: '30s',
group_interval: '5m',
repeat_interval: '12h',

@@ -7,10 +7,15 @@
{
alert: 'AlertmanagerConfigInconsistent',
annotations: {
message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.',
message: |||
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
{{ end }}
|||,
},
expr: |||
count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
||| % $._config,
'for': '5m',
labels: {
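A note on the `%(alertmanagerSelector)s` placeholders above: `||| % $._config` applies jsonnet's Python-style string formatting, filling each named placeholder from the mixin's `_config` object. A self-contained sketch of just that mechanism (the selector value is the one visible in the rendered rule later in this compare; everything else is illustrative):

```jsonnet
// Sketch of the `||| % config` substitution used by the alert above.
local config = {
  alertmanagerSelector: 'job="alertmanager-main",namespace="monitoring"',
};

{
  expr: |||
    count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
  ||| % config,
}
```

Evaluating this yields exactly the substituted PromQL that appears in the rendered PrometheusRule hunk further down.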
@@ -1,50 +0,0 @@
[
// Drop all kubelet metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)',
action: 'drop',
},
// Drop all scheduler metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)',
action: 'drop',
},
// Drop all apiserver metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)',
action: 'drop',
},
// Drop all docker metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)',
action: 'drop',
},
// Drop all reflector metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)',
action: 'drop',
},
// Drop all etcd metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)',
action: 'drop',
},
// Drop all transformation metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'transformation_(transformation_latencies_microseconds|failures_total)',
action: 'drop',
},
// Drop all other metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: '(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)',
action: 'drop',
},
]
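This deleted file evaluates to a plain jsonnet array, which the ServiceMonitor definitions later in this compare consume by `import` — either directly as `metricRelabelings`, or concatenated with endpoint-specific rules. A minimal sketch of the latter pattern, mirroring the `(import ...) + [ ... ]` usage visible in the kubelet and apiserver hunks below (it assumes the vendored kube-prometheus tree is on the jsonnet search path):

```jsonnet
// Sketch: combine the imported drop-list with one extra endpoint-specific rule.
local droppedMetrics = import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet';

{
  metricRelabelings: droppedMetrics + [
    {
      sourceLabels: ['__name__'],
      regex: 'etcd_(debugging|disk|request|server).*',
      action: 'drop',
    },
  ],
}
```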
@@ -18,7 +18,7 @@
"subdir": ""
}
},
"version": "master"
"version": "release-0.2"
},
{
"name": "grafana",

@@ -48,7 +48,7 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "master"
"version": "release-3.4"
},
{
"name": "prometheus",

@@ -1,20 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';

{
prometheus+:: {
clusterRole+: {
rules+:
local role = k.rbac.v1.role;
local policyRule = role.rulesType;
local rule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'services',
'endpoints',
'pods',
]) +
policyRule.withVerbs(['get', 'list', 'watch']);
[rule]
},
}
}

@@ -3,18 +3,13 @@ local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;

{
_config+:: {
eks: {
minimumAvailableIPs: 10,
minimumAvailableIPsTime: '10m'
}
},
prometheus+: {
serviceMonitorCoreDNS+: {
spec+: {
endpoints: [
{
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
interval: "15s",
targetPort: 9153
}
]
},
},
AwsEksCniMetricService:
service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
service.mixin.metadata.withNamespace('kube-system') +

@@ -59,14 +54,14 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
name: 'kube-prometheus-eks.rules',
rules: [
{
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs,
labels: {
severity: 'critical',
},
annotations: {
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.'
},
'for': '10m',
'for': $._config.eks.minimumAvailableIPsTime,
alert: 'EksAvailableIPs'
},
],
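Because the hunk above routes both the threshold and the `for` duration through `$._config.eks`, consumers can retune the alert without patching the rule text. A minimal sketch of such an override (the addon import path is an assumption based on the file names in this compare; the two `eks` fields are the ones introduced above):

```jsonnet
// Sketch: consume the EKS addon with more IP headroom and a longer window.
(import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-eks.libsonnet') +
{
  _config+:: {
    eks+: {
      minimumAvailableIPs: 25,         // default above: 10
      minimumAvailableIPsTime: '30m',  // default above: '10m'
    },
  },
}
```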
@@ -50,7 +50,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
kind: 'ServiceMonitor',
metadata: {
name: 'etcd',
namespace: 'kube-system',
namespace: $._config.namespace,
labels: {
'k8s-app': 'etcd',
},

@@ -9,9 +9,6 @@
'kube-rbac-proxy'+: {
limits: {},
},
'kube-state-metrics'+: {
limits: {},
},
'node-exporter'+: {
limits: {},
},
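These one-line overrides rely on jsonnet's merge operators: `+:` merges a field into whatever the base object defines instead of replacing the whole object, and `+::` does the same for hidden fields such as `_config`. A standalone sketch of the semantics (the field values are illustrative, shaped like the resource blocks elsewhere in this compare):

```jsonnet
// Sketch of the `+:` merge the hunk above depends on.
local base = {
  resources: {
    'kube-state-metrics': {
      requests: { cpu: '100m', memory: '150Mi' },
      limits: { cpu: '100m', memory: '150Mi' },
    },
  },
};

base + {
  resources+: {
    'kube-state-metrics'+: {
      limits: {},  // clears the limits while the requests survive the merge
    },
  },
}
```

The result keeps `requests` from the base and replaces only `limits`, which is exactly what the `limits: {}` entries above achieve.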
@@ -78,8 +78,8 @@ local configMapList = k3.core.v1.configMapList;
// 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2
],

cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
cadvisorSelector: 'job="kubelet"',
kubeletSelector: 'job="kubelet"',
kubeStateMetricsSelector: 'job="kube-state-metrics"',
nodeExporterSelector: 'job="node-exporter"',
notKubeDnsSelector: 'job!="kube-dns"',

@@ -116,10 +116,6 @@ local configMapList = k3.core.v1.configMapList;
requests: { cpu: '10m', memory: '20Mi' },
limits: { cpu: '20m', memory: '40Mi' },
},
'kube-state-metrics': {
requests: { cpu: '100m', memory: '150Mi' },
limits: { cpu: '100m', memory: '150Mi' },
},
'node-exporter': {
requests: { cpu: '102m', memory: '180Mi' },
limits: { cpu: '250m', memory: '180Mi' },

@@ -8,10 +8,13 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
collectors: '', // empty string gets a default set
scrapeInterval: '30s',
scrapeTimeout: '30s',

baseCPU: '100m',
baseMemory: '150Mi',
},

versions+:: {
kubeStateMetrics: 'v1.9.2',
kubeStateMetrics: 'v1.8.0',
kubeRbacProxy: 'v0.4.1',
},
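With the `baseCPU`/`baseMemory` knobs added above, resizing kube-state-metrics becomes a small `_config` override rather than an edit to the shared `resources` table. A minimal sketch (import path assumed; the knob names are the ones introduced in this hunk):

```jsonnet
// Sketch: raise kube-state-metrics' base sizing through the new knobs.
(import 'kube-prometheus/kube-prometheus.libsonnet') +
{
  _config+:: {
    kubeStateMetrics+: {
      baseCPU: '200m',     // default above: '100m'
      baseMemory: '300Mi', // default above: '150Mi'
    },
  },
}
```

As the container-builder hunk further down shows, both `withRequests` and `withLimits` read these values, so requests and limits move together.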
@@ -122,22 +125,6 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
rulesType.withApiGroups(['storage.k8s.io']) +
rulesType.withResources([
'storageclasses',
'volumeattachments',
]) +
rulesType.withVerbs(['list', 'watch']),

rulesType.new() +
rulesType.withApiGroups(['admissionregistration.k8s.io']) +
rulesType.withResources([
'validatingwebhookconfigurations',
'mutatingwebhookconfigurations',
]) +
rulesType.withVerbs(['list', 'watch']),

rulesType.new() +
rulesType.withApiGroups(['networking.k8s.io']) +
rulesType.withResources([
'networkpolicies',
]) +
rulesType.withVerbs(['list', 'watch']),
];

@@ -187,8 +174,8 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
'--telemetry-host=127.0.0.1',
'--telemetry-port=8082',
] + if $._config.kubeStateMetrics.collectors != '' then ['--collectors=' + $._config.kubeStateMetrics.collectors] else []) +
container.mixin.resources.withRequests($._config.resources['kube-state-metrics'].requests) +
container.mixin.resources.withLimits($._config.resources['kube-state-metrics'].limits);
container.mixin.resources.withRequests({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory }) +
container.mixin.resources.withLimits({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory });

local c = [proxyClusterMetrics, proxySelfMetrics, kubeStateMetrics];

@@ -89,8 +89,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
'--path.procfs=/host/proc',
'--path.sysfs=/host/sys',
'--path.rootfs=/host/root',
'--no-collector.wifi',
'--no-collector.hwmon',

// The following settings have been taken from
// https://github.com/prometheus/node_exporter/blob/0662673/collector/filesystem_linux.go#L30-L31
// Once node exporter is being released with those settings, this can be removed.

@@ -169,7 +168,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
port: 'https',
scheme: 'https',
interval: '15s',
interval: '30s',
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{

@@ -191,7 +191,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local rules =
policyRule.new() +
policyRule.withApiGroups(['metrics.k8s.io']) +
policyRule.withResources(['pods', 'nodes']) +
policyRule.withResources(['pods']) +
policyRule.withVerbs(['get','list','watch']);

clusterRole.new() +

@@ -160,7 +160,6 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local resourceRequirements = container.mixin.resourcesType;
local selector = statefulSet.mixin.spec.selectorType;

local resources =
resourceRequirements.new() +
resourceRequirements.withRequests({ memory: '400Mi' });

@@ -285,11 +284,10 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
insecureSkipVerify: true,
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet'),
relabelings: [
{
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path',
targetLabel: 'metrics_path'
},
],
},

@@ -306,7 +304,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
relabelings: [
{
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path',
targetLabel: 'metrics_path'
},
],
metricRelabelings: [

@@ -349,7 +347,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
port: 'http-metrics',
interval: '30s',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
metricRelabelings: [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*',

@@ -404,7 +402,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
serverName: 'kubernetes',
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
metricRelabelings: [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*',

@@ -420,11 +418,6 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
regex: 'apiserver_admission_step_admission_latencies_seconds_.*',
action: 'drop',
},
{
sourceLabels: ['__name__', 'le'],
regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)',
action: 'drop',
},
],
},
],
@@ -8,8 +8,8 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "5770a6d286fe48682e29b54ce0df37e7d24b3280",
"sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0="
"version": "cbc1340af53f50728181f97f6bce442ac33d8993",
"sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw="
},
{
"name": "grafana",

@@ -30,7 +30,7 @@
"subdir": "grafana-builder"
}
},
"version": "676ff4b4fe9135f85a5d6e30523d64d2d3713087",
"version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd",
"sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE="
},
{

@@ -41,8 +41,8 @@
"subdir": "grafonnet"
}
},
"version": "f3ee1d810858cf556d25f045b53cb0f1fd10b94e",
"sum": "14YBZUP/cl8qi9u86xiuUS4eXQrEAam+4GSg6i9n9Ys="
"version": "b82411476842f583817e67feff5becf1228fd540",
"sum": "mEosZ6hZCTCw8AaASEtRFjY8PSmpvqI3xj6IWpwcroU="
},
{
"name": "ksonnet",

@@ -72,8 +72,8 @@
"subdir": ""
}
},
"version": "68f82d2a428d91df57e9af43739981a6a8ede897",
"sum": "J/tuXi0Z8GRHo63pM17YFIyk4QgkFuMcQ20mAxi1flM="
"version": "a132ade95740f9364e477ae8e730eabd650d14cb",
"sum": "+5+biGgOmWhNenvUxAtdejDgL3FvdDp6Dv84v3Gdg6A="
},
{
"name": "node-mixin",

@@ -83,7 +83,7 @@
"subdir": "docs/node-mixin"
}
},
"version": "2cae917bb7e0b6379221e8a24da012b16e63d661",
"version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5",
"sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg="
},
{

@@ -94,8 +94,8 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "31700a05df64c2b4e32bb0ecd8baa25279144778",
"sum": "/cohvDTaIiLElG66tKeQsi4v1M9mlGDKjOBSWivL9TU="
"version": "431844f0a7c289e4255a68f09a18fcca09637fb2",
"sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM="
},
{
"name": "prometheus-operator",

@@ -116,19 +116,8 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "a7ee9d1abe1b1a3670a02ede1135cadb660b9d0c",
"version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
"name": "slo-libsonnet",
"source": {
"git": {
"remote": "https://github.com/metalmatze/slo-libsonnet",
"subdir": "slo-libsonnet"
}
},
"version": "437c402c5f3ad86c3c16db8471f1649284fef0ee",
"sum": "2Zcyku1f558VrUpMaJnI78fahDksPLcS1idmxxwcQ7Q="
}
]
}
@@ -1,6 +1,6 @@
apiVersion: v1
data:
alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gIm5hbWVzcGFjZSIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gImpvYiIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
kind: Secret
metadata:
name: alertmanager-main
File diff suppressed because it is too large
@@ -86,22 +86,6 @@ rules:
- storage.k8s.io
resources:
- storageclasses
- volumeattachments
verbs:
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
- mutatingwebhookconfigurations
verbs:
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
verbs:
- list
- watch

@@ -55,7 +55,7 @@ spec:
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: quay.io/coreos/kube-state-metrics:v1.9.2
image: quay.io/coreos/kube-state-metrics:v1.8.0
name: kube-state-metrics
resources:
limits:

@@ -20,8 +20,6 @@ spec:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --no-collector.wifi
- --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
image: quay.io/prometheus/node-exporter:v0.18.1

@@ -8,7 +8,7 @@ metadata:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
interval: 30s
port: https
relabelings:
- action: replace

@@ -11,7 +11,6 @@ rules:
- metrics.k8s.io
resources:
- pods
- nodes
verbs:
- get
- list

@@ -40,10 +40,10 @@ spec:
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
@@ -68,74 +68,66 @@ spec:
- name: kube-apiserver.rules
rules:
- expr: |
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
record: cluster:apiserver_request_duration_seconds:mean5m
- expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- name: k8s.rules
rules:
- expr: |
sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace)
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace)
record: namespace:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum by (cluster, namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info)
sum by (namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])
) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
- expr: |
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
container_memory_working_set_bytes{job="kubelet", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_memory_working_set_bytes
- expr: |
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
container_memory_rss{job="kubelet", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_memory_rss
- expr: |
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
container_memory_cache{job="kubelet", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_memory_cache
- expr: |
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
container_memory_swap{job="kubelet", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
record: node_namespace_pod_container:container_memory_swap
- expr: |
sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
record: namespace:container_memory_usage_bytes:sum
- expr: |
sum by (namespace) (
sum by (namespace, pod) (
max by (namespace, pod, container) (
kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
) * on(namespace, pod) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
* on (namespace, pod)
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
)
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
- expr: |
sum by (namespace) (
sum by (namespace, pod) (
max by (namespace, pod, container) (
kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
) * on(namespace, pod) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
* on (namespace, pod)
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
)
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
- expr: |
@@ -147,7 +139,7 @@ spec:
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
"workload", "$1", "owner_name", "(.*)"
)
) by (cluster, namespace, workload, pod)
) by (namespace, workload, pod)
labels:
workload_type: deployment
record: mixin_pod_workload

@@ -157,7 +149,7 @@
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (cluster, namespace, workload, pod)
) by (namespace, workload, pod)
labels:
workload_type: daemonset
record: mixin_pod_workload

@@ -167,7 +159,7 @@
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (cluster, namespace, workload, pod)
) by (namespace, workload, pod)
labels:
workload_type: statefulset
record: mixin_pod_workload

@@ -220,14 +212,13 @@
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- name: node.rules
rules:
- expr: |
sum(min(kube_pod_info) by (cluster, node))
- expr: sum(min(kube_pod_info) by (node))
record: ':kube_pod_info_node_count:'
- expr: |
max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
count by (cluster, node) (sum by (node, cpu) (
count by (node) (sum by (node, cpu) (
node_cpu_seconds_total{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:

@@ -242,7 +233,7 @@ spec:
node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"}
)
) by (cluster)
)
record: :node_memory_MemAvailable_bytes:sum
- name: kube-prometheus-node-recording.rules
rules:

@@ -710,9 +701,9 @@ spec:
}} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
expr: |
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
kubelet_volume_stats_capacity_bytes{job="kubelet"}
< 0.03
for: 1m
labels:

@@ -725,12 +716,12 @@
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
expr: |
(
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
kubelet_volume_stats_capacity_bytes{job="kubelet"}
) < 0.15
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 1h
labels:
severity: critical
@@ -769,156 +760,6 @@ spec:
for: 15m
labels:
severity: warning
- name: kube-apiserver-error
rules:
- alert: ErrorBudgetBurn
annotations:
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
expr: |
(
status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000)
)
or
(
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000)
)
labels:
job: apiserver
severity: critical
- alert: ErrorBudgetBurn
annotations:
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
expr: |
(
status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000)
)
or
(
status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000)
)
labels:
job: apiserver
severity: warning
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[5m]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate5m
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[30m]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate30m
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[1h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate1h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[2h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate2h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[6h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate6h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[1d]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate1d
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[3d]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate3d
- expr: |
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate5m
- expr: |
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate30m
- expr: |
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate1h
- expr: |
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate2h
- expr: |
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate6h
- expr: |
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate1d
- expr: |
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate3d
- name: kubernetes-system-apiserver
rules:
- alert: KubeAPILatencyHigh
@@ -1057,7 +898,7 @@ spec:
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: |
max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
for: 15m
labels:
severity: warning

@@ -1066,7 +907,7 @@
message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
expr: |
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
absent(up{job="kubelet"} == 1)
for: 15m
labels:
severity: critical

@@ -1299,10 +1140,13 @@
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
are out of sync.
message: |
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
{{ end }}
expr: |
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})) != 1
for: 5m
labels:
severity: critical
@@ -10,38 +10,6 @@ spec:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|request|server).*
sourceLabels:
@@ -54,11 +22,6 @@ spec:
regex: apiserver_admission_step_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
sourceLabels:
- __name__
- le
port: https
scheme: https
tlsConfig:
@@ -9,38 +9,6 @@ spec:
endpoints:
- interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|request|server).*
sourceLabels:
@@ -10,39 +10,6 @@ spec:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
|
||||
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rul
es_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
port: https-metrics
|
||||
relabelings:
|
||||
- sourceLabels:
|
||||
|
||||
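All of these `metricRelabelings` entries share one shape: compare the metric name in `__name__` against a regex and drop the sample on a match. A minimal, self-contained sketch of that matching behaviour (my own illustration, not code from this repository) — Prometheus fully anchors relabeling regexes, which the Go equivalent below makes explicit:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// The rule `regex: etcd_(debugging|disk|request|server).*` is evaluated
	// as a fully anchored expression against the value of __name__.
	re := regexp.MustCompile(`^(?:etcd_(debugging|disk|request|server).*)$`)

	for _, name := range []string{
		"etcd_disk_wal_fsync_duration_seconds", // matches -> sample dropped
		"etcd_server_has_leader",               // matches -> sample dropped
		"etcd_object_counts",                   // no match -> sample kept
	} {
		fmt.Printf("%-40s dropped=%v\n", name, re.MatchString(name))
	}
}
```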
@@ -17,7 +17,6 @@ package e2e

import (
	"log"
	"os"
	"strings"
	"testing"
	"time"

@@ -58,22 +57,23 @@ func testMain(m *testing.M) int {
}

func TestQueryPrometheus(t *testing.T) {
	t.Parallel()
	queries := []struct {
		query   string
		expectN int
	}{
		{
			query:   `up{job="node-exporter"} == 1`,
			expectN: 1,
		}, {
			// query: `up{job="node-exporter"} == 1`,
			// expectN: 1,
			// }, {
			//	query: `up{job="kubelet"} == 1`,
			//	expectN: 1,
			// }, {
			query:   `up{job="apiserver"} == 1`,
			expectN: 1,
		}, {
			query:   `up{job="kube-state-metrics"} == 1`,
			expectN: 1,
			// }, {
			//	query: `up{job="kube-state-metrics"} == 1`,
			//	expectN: 1,
		}, {
			query:   `up{job="prometheus-k8s"} == 1`,
			expectN: 1,

@@ -116,25 +116,3 @@ func TestQueryPrometheus(t *testing.T) {
			t.Fatal(err)
		}
}

func TestDroppedMetrics(t *testing.T) {
	// query metadata for all scraped metrics
	md, err := promClient.metadata("{job=~\".+\"}")
	if err != nil {
		log.Fatal(err)
	}
	for _, k := range md.Data {
		// check whether the metric's help text marks it as deprecated
		if strings.Contains(k.Help, "Deprecated") {
			// query Prometheus for the deprecated metric
			n, err := promClient.query(k.Metric)
			if err != nil {
				log.Fatal(err)
			}
			if n > 0 {
				t.Fatalf("deprecated metric with name: %s and help text: %s exists.", k.Metric, k.Help)
			}
		}

	}
}

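The dropped-metric sweep above can also be approximated outside the test harness by querying Prometheus's targets-metadata API directly. A minimal sketch, assuming a Prometheus reachable on localhost:9090 (for example via `kubectl --namespace monitoring port-forward svc/prometheus-k8s 9090`); the `/api/v1/targets/metadata` endpoint and its `match_target` parameter are standard Prometheus API, while everything else here is illustrative:

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strings"
)

type metadataResponse struct {
	Status string `json:"status"`
	Data   []struct {
		Metric string `json:"metric"`
		Help   string `json:"help"`
	} `json:"data"`
}

func main() {
	// match_target limits the result to targets whose labels match the selector.
	base := "http://localhost:9090/api/v1/targets/metadata"
	resp, err := http.Get(base + "?match_target=" + url.QueryEscape(`{job=~".+"}`))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var md metadataResponse
	if err := json.NewDecoder(resp.Body).Decode(&md); err != nil {
		panic(err)
	}
	for _, m := range md.Data {
		// Flag anything whose help text says it is deprecated.
		if strings.Contains(m.Help, "Deprecated") {
			fmt.Printf("still exposed: %s (%s)\n", m.Metric, m.Help)
		}
	}
}
```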
@@ -15,10 +15,6 @@
 package e2e

import (
	"bytes"
	"encoding/json"
	"fmt"

	"k8s.io/client-go/kubernetes"

	"github.com/Jeffail/gabs"
@@ -54,41 +50,3 @@ func (c *prometheusClient) query(query string) (int, error) {
	n, err := res.ArrayCountP("data.result")
	return n, err
}

type Metadata struct {
	Status string `json:"status,omitempty"`
	Data   []Data `json:"data,omitempty"`
}

type Data struct {
	Metric string `json:"metric,omitempty"`
	Help   string `json:"help,omitempty"`
}

// metadata makes a request against the Prometheus /api/v1/targets/metadata endpoint.
// It returns all the metrics and their metadata.
func (c *prometheusClient) metadata(query string) (Metadata, error) {
	req := c.kubeClient.CoreV1().RESTClient().Get().
		Namespace("monitoring").
		Resource("pods").
		SubResource("proxy").
		Name("prometheus-k8s-0:9090").
		Suffix("/api/v1/targets/metadata").Param("match_target", query)

	var data Metadata
	b, err := req.DoRaw()
	if err != nil {
		return data, err
	}

	r := bytes.NewReader(b)
	decoder := json.NewDecoder(r)
	err = decoder.Decode(&data)
	if err != nil {
		return data, err
	}
	if data.Status != "success" {
		return data, fmt.Errorf("status of returned response was not successful; status: %s", data.Status)
	}
	return data, err
}

@@ -33,7 +33,7 @@ run_e2e_tests() {
    ./kind delete cluster
}

cluster_compatible_versions=("v1.16.1" "v1.17.0")
cluster_compatible_versions=("v1.14.1" "v1.15.0" "v1.16.1" "v1.17.0")

for cluster_version in "${cluster_compatible_versions[@]}"
do