116 lines
6.2 KiB
YAML
116 lines
6.2 KiB
YAML
groups:
|
|
- name: ./kubernetes.rules
|
|
rules:
|
|
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
|
|
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
|
|
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
controller, pod_name, container_name)
|
|
- record: cluster_namespace_controller_pod_container:spec_cpu_shares
|
|
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
|
|
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
container_name)
|
|
- record: cluster_namespace_controller_pod_container:cpu_usage:rate
|
|
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
|
|
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
controller, pod_name, container_name)
|
|
- record: cluster_namespace_controller_pod_container:memory_usage:bytes
|
|
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
|
|
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
container_name)
|
|
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
|
|
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
|
|
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
controller, pod_name, container_name)
|
|
- record: cluster_namespace_controller_pod_container:memory_rss:bytes
|
|
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
|
|
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
container_name)
|
|
- record: cluster_namespace_controller_pod_container:memory_cache:bytes
|
|
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
|
|
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
container_name)
|
|
- record: cluster_namespace_controller_pod_container:disk_usage:bytes
|
|
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
|
|
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
container_name)
|
|
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
|
|
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
|
|
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
controller, pod_name, container_name, scope, type)
|
|
- record: cluster_namespace_controller_pod_container:memory_oom:rate
|
|
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
|
|
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
controller, pod_name, container_name, scope, type)
|
|
- record: cluster:memory_allocation:percent
|
|
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
|
|
/ sum(machine_memory_bytes) BY (cluster)
|
|
- record: cluster:memory_used:percent
|
|
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
|
|
BY (cluster)
|
|
- record: cluster:cpu_allocation:percent
|
|
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
|
|
* ON(cluster, instance) machine_cpu_cores) BY (cluster)
|
|
- record: cluster:node_cpu_use:percent
|
|
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
|
|
BY (cluster)
|
|
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
|
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
|
|
cluster, job, resource, verb)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
|
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
|
|
cluster, job, resource, verb)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
|
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
|
|
cluster, job, resource, verb)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
|
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
|
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
|
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
|
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
|
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
|
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|
|
- record: cluster:scheduler_binding_latency:quantile_seconds
|
|
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.99"
|
|
- record: cluster:scheduler_binding_latency:quantile_seconds
|
|
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.9"
|
|
- record: cluster:scheduler_binding_latency:quantile_seconds
|
|
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
|
|
BY (le, cluster)) / 1e+06
|
|
labels:
|
|
quantile: "0.5"
|