172 lines
5.9 KiB
Plaintext
172 lines
5.9 KiB
Plaintext
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
|
|
|
|
### Container resources ###
|
|
|
|
# Configured memory limit per container, in bytes.
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment; series whose pod_name does not match the pattern are
# passed through without a controller label.
cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
  sum(
    label_replace(
      container_spec_memory_limit_bytes{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name)
|
|
|
|
# Configured CPU shares per container.
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
cluster_namespace_controller_pod_container:spec_cpu_shares =
  sum(
    label_replace(
      container_spec_cpu_shares{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name)
|
|
|
|
# Per-container CPU usage rate.
# irate() uses only the two most recent samples inside the 5m lookback, so
# this is an instantaneous rate rather than a 5-minute average.
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
cluster_namespace_controller_pod_container:cpu_usage:rate =
  sum(
    label_replace(
      irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name)
|
|
|
|
# Memory usage per container, in bytes.
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
cluster_namespace_controller_pod_container:memory_usage:bytes =
  sum(
    label_replace(
      container_memory_usage_bytes{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name)
|
|
|
|
# Memory working set per container, in bytes.
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
cluster_namespace_controller_pod_container:memory_working_set:bytes =
  sum(
    label_replace(
      container_memory_working_set_bytes{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name)
|
|
|
|
# RSS memory per container (from container_memory_rss).
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
cluster_namespace_controller_pod_container:memory_rss:bytes =
  sum(
    label_replace(
      container_memory_rss{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name)
|
|
|
|
# Cache memory per container (from container_memory_cache).
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
cluster_namespace_controller_pod_container:memory_cache:bytes =
  sum(
    label_replace(
      container_memory_cache{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name)
|
|
|
|
# Disk usage per container, in bytes.
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
cluster_namespace_controller_pod_container:disk_usage:bytes =
  sum(
    label_replace(
      container_disk_usage_bytes{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name)
|
|
|
|
# Per-container rate of memory failures, kept separate per scope and type.
# irate() uses only the two most recent samples inside the 5m lookback.
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
cluster_namespace_controller_pod_container:memory_pagefaults:rate =
  sum(
    label_replace(
      irate(container_memory_failures_total{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name, scope, type)
|
|
|
|
# Per-container rate of container_memory_failcnt increments.
# irate() uses only the two most recent samples inside the 5m lookback.
# A "controller" label is derived from pod_name by dropping its final
# "-<suffix>" segment before aggregating.
# NOTE(review): the scope/type grouping matches the pagefaults rule; confirm
# container_memory_failcnt actually carries those labels (grouping by an
# absent label is harmless but misleading).
cluster_namespace_controller_pod_container:memory_oom:rate =
  sum(
    label_replace(
      irate(container_memory_failcnt{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"
    )
  ) by (cluster, namespace, controller, pod_name, container_name, scope, type)
|
|
|
|
### Cluster resources ###
|
|
|
|
# Percentage of each cluster's machine memory covered by the memory limits
# of series that carry a pod_name.
cluster:memory_allocation:percent =
  100
    * sum(container_spec_memory_limit_bytes{pod_name!=""}) by (cluster)
    / sum(machine_memory_bytes) by (cluster)
|
|
|
|
# Percentage of each cluster's machine memory currently used by series that
# carry a pod_name.
cluster:memory_used:percent =
  100
    * sum(container_memory_usage_bytes{pod_name!=""}) by (cluster)
    / sum(machine_memory_bytes) by (cluster)
|
|
|
|
# Percentage of each cluster's CPU shares assigned to series that carry a
# pod_name. The denominator multiplies the shares of the id="/" series
# (presumably the root cgroup -- confirm) by the core count of the same
# (cluster, instance).
cluster:cpu_allocation:percent =
  100
    * sum(container_spec_cpu_shares{pod_name!=""}) by (cluster)
    / sum(
        container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
      ) by (cluster)
|
|
|
|
# Percentage of each cluster's CPU cores in use: the 5m per-second rate of
# node_cpu over every mode except "idle", divided by the total core count.
cluster:node_cpu_use:percent =
  100
    * sum(rate(node_cpu{mode!="idle"}[5m])) by (cluster)
    / sum(machine_cpu_cores) by (cluster)
|
|
|
|
### API latency ###
|
|
|
|
# API server request latency quantiles per (cluster, job, resource, verb).
# Raw metrics are in microseconds; dividing by 1e6 converts to seconds.
#
# The bucket counters are wrapped in rate() before aggregation so that:
#   * the quantile describes the last 5 minutes instead of the whole time
#     since each API server started, and
#   * a counter reset on one instance does not corrupt the summed buckets.
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(
    0.99,
    sum by (le, cluster, job, resource, verb) (
      rate(apiserver_request_latencies_bucket[5m])
    )
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(
    0.9,
    sum by (le, cluster, job, resource, verb) (
      rate(apiserver_request_latencies_bucket[5m])
    )
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(
    0.5,
    sum by (le, cluster, job, resource, verb) (
      rate(apiserver_request_latencies_bucket[5m])
    )
  ) / 1e6
|
|
|
|
### Scheduling latency ###
|
|
|
|
# End-to-end scheduling latency quantiles per cluster, in seconds (the
# source histogram is in microseconds, hence the division by 1e6).
# Buckets go through rate() so the quantile reflects the last 5 minutes and
# is not skewed by counter resets or by all-time cumulative counts.
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(
    0.99,
    sum by (le, cluster) (rate(scheduler_e2e_scheduling_latency_microseconds_bucket[5m]))
  ) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(
    0.9,
    sum by (le, cluster) (rate(scheduler_e2e_scheduling_latency_microseconds_bucket[5m]))
  ) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(
    0.5,
    sum by (le, cluster) (rate(scheduler_e2e_scheduling_latency_microseconds_bucket[5m]))
  ) / 1e6
|
|
|
|
# Scheduling-algorithm latency quantiles per cluster, in seconds (the
# source histogram is in microseconds, hence the division by 1e6).
# Buckets go through rate() so the quantile reflects the last 5 minutes and
# is not skewed by counter resets or by all-time cumulative counts.
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(
    0.99,
    sum by (le, cluster) (rate(scheduler_scheduling_algorithm_latency_microseconds_bucket[5m]))
  ) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(
    0.9,
    sum by (le, cluster) (rate(scheduler_scheduling_algorithm_latency_microseconds_bucket[5m]))
  ) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(
    0.5,
    sum by (le, cluster) (rate(scheduler_scheduling_algorithm_latency_microseconds_bucket[5m]))
  ) / 1e6
|
|
|
|
# Binding latency quantiles per cluster, in seconds (the source histogram
# is in microseconds, hence the division by 1e6).
# Buckets go through rate() so the quantile reflects the last 5 minutes and
# is not skewed by counter resets or by all-time cumulative counts.
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(
    0.99,
    sum by (le, cluster) (rate(scheduler_binding_latency_microseconds_bucket[5m]))
  ) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(
    0.9,
    sum by (le, cluster) (rate(scheduler_binding_latency_microseconds_bucket[5m]))
  ) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(
    0.5,
    sum by (le, cluster) (rate(scheduler_binding_latency_microseconds_bucket[5m]))
  ) / 1e6
|