kube-prometheus/jsonnet: Use jsonnet-bundler

Frederic Branczyk
2018-04-25 15:04:20 +01:00
parent 992bcdccc1
commit edf21e4382
141 changed files with 11505 additions and 7219 deletions
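With this change the manifests below are generated from jsonnet whose external dependencies are vendored via jsonnet-bundler instead of being maintained by hand. As a rough sketch only (the dependency shown is illustrative and not taken from this commit), a jsonnetfile.json for jsonnet-bundler looks roughly like the following, and running `jb install` pulls the listed sources into vendor/:

{
  "dependencies": [
    {
      "name": "grafonnet",
      "source": {
        "git": {
          "remote": "https://github.com/grafana/grafonnet-lib",
          "subdir": "grafonnet"
        }
      },
      "version": "master"
    }
  ]
}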

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,236 @@
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
creationTimestamp: null
name: servicemonitors.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
kind: ServiceMonitor
plural: servicemonitors
scope: Namespaced
validation:
openAPIV3Schema:
description: ServiceMonitor defines monitoring for a set of services.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds'
type: string
spec:
description: ServiceMonitorSpec contains specification parameters for a
ServiceMonitor.
properties:
endpoints:
description: A list of endpoints allowed as part of this ServiceMonitor.
items:
description: Endpoint defines a scrapeable endpoint serving Prometheus
metrics.
properties:
basicAuth:
description: 'BasicAuth allows an endpoint to authenticate over
basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoints'
properties:
password:
description: SecretKeySelector selects a key of a Secret.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
username:
description: SecretKeySelector selects a key of a Secret.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
bearerTokenFile:
description: File to read bearer token for scraping targets.
type: string
honorLabels:
description: HonorLabels chooses the metric's labels on collisions
with target labels.
type: boolean
interval:
description: Interval at which metrics should be scraped
type: string
metricRelabelings:
description: MetricRelabelConfigs to apply to samples before ingestion.
items:
description: 'RelabelConfig allows dynamic rewriting of the
label set, being applied to samples before ingestion. It defines
`<metric_relabel_configs>`-section of Prometheus configuration.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs'
properties:
action:
description: Action to perform based on regex matching.
Default is 'replace'
type: string
modulus:
description: Modulus to take of the hash of the source label
values.
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched. Default is '(.*)'
type: string
replacement:
description: Replacement value against which a regex replace
is performed if the regular expression matches. Regex
capture groups are available. Default is '$1'
type: string
separator:
description: Separator placed between concatenated source
label values. Default is ';'.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
separator and matched against the configured regular expression
for the replace, keep, and drop actions.
items:
type: string
type: array
targetLabel:
description: Label to which the resulting value is written
in a replace action. It is mandatory for replace actions.
Regex capture groups are available.
type: string
type: array
params:
description: Optional HTTP URL parameters
type: object
path:
description: HTTP path to scrape for metrics.
type: string
port:
description: Name of the service port this endpoint refers to.
Mutually exclusive with targetPort.
type: string
scheme:
description: HTTP scheme to use for scraping.
type: string
scrapeTimeout:
description: Timeout after which the scrape is ended
type: string
targetPort: {}
tlsConfig:
description: TLSConfig specifies TLS configuration parameters.
properties:
caFile:
description: The CA cert to use for the targets.
type: string
certFile:
description: The client cert file for the targets.
type: string
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keyFile:
description: The client key file for the targets.
type: string
serverName:
description: Used to verify the hostname for the targets.
type: string
type: array
jobLabel:
description: The label to use to retrieve the job name from.
type: string
namespaceSelector:
description: A selector for selecting namespaces either selecting all
namespaces or a list of namespaces.
properties:
any:
description: Boolean describing whether all namespaces are selected
in contrast to a list restricting them.
type: boolean
matchNames:
description: List of namespace names.
items:
type: string
type: array
selector:
description: A label selector is a label query over a set of resources.
The result of matchLabels and matchExpressions are ANDed. An empty
label selector matches all objects. A null label selector matches
no objects.
properties:
matchExpressions:
description: matchExpressions is a list of label selector requirements.
The requirements are ANDed.
items:
description: A label selector requirement is a selector that contains
values, a key, and an operator that relates the key and values.
properties:
key:
description: key is the label key that the selector applies
to.
type: string
operator:
description: operator represents a key's relationship to a
set of values. Valid operators are In, NotIn, Exists and
DoesNotExist.
type: string
values:
description: values is an array of string values. If the operator
is In or NotIn, the values array must be non-empty. If the
operator is Exists or DoesNotExist, the values array must
be empty. This array is replaced during a strategic merge
patch.
items:
type: string
type: array
required:
- key
- operator
type: array
matchLabels:
description: matchLabels is a map of {key,value} pairs. A single
{key,value} in the matchLabels map is equivalent to an element
of matchExpressions, whose key field is "key", the operator is
"In", and the values array contains only "value". The requirements
are ANDed.
type: object
targetLabels:
description: TargetLabels transfers labels on the Kubernetes Service
onto the target.
items:
type: string
type: array
required:
- endpoints
- selector
required:
- spec
version: v1
status:
acceptedNames:
kind: ""
plural: ""
conditions: null
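To illustrate the schema above, a minimal ServiceMonitor that selects a Service by label and scrapes one named port could look roughly like the following; the names, namespace, and 30s interval are placeholders, not objects added by this commit:

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: example-app
  namespace: monitoring
spec:
  jobLabel: k8s-app
  selector:
    matchLabels:
      k8s-app: example-app
  namespaceSelector:
    matchNames:
    - default
  endpoints:
  - port: web
    interval: 30s
    path: /metrics
    scheme: http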


@@ -31,6 +31,8 @@ spec:
requests:
cpu: 100m
memory: 50Mi
nodeSelector:
beta.kubernetes.io/os: linux
securityContext:
runAsNonRoot: true
runAsUser: 65534


@@ -1,9 +1,12 @@
apiVersion: v1
kind: Service
metadata:
labels:
k8s-app: prometheus-operator
name: prometheus-operator
namespace: monitoring
spec:
clusterIP: None
ports:
- name: http
port: 8080


@@ -6,6 +6,9 @@ metadata:
name: main
namespace: monitoring
spec:
baseImage: quay.io/prometheus/alertmanager
nodeSelector:
beta.kubernetes.io/os: linux
replicas: 3
serviceAccountName: alertmanager-main
version: v0.14.0


@@ -1,8 +0,0 @@
apiVersion: v1
data:
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
kind: Secret
metadata:
name: alertmanager-main
namespace: monitoring
type: Opaque


@@ -0,0 +1,8 @@
apiVersion: v1
data:
alertmanager.yaml: Cmdsb2JhbDoKICByZXNvbHZlX3RpbWVvdXQ6IDVtCnJvdXRlOgogIGdyb3VwX2J5OiBbJ2pvYiddCiAgZ3JvdXBfd2FpdDogMzBzCiAgZ3JvdXBfaW50ZXJ2YWw6IDVtCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByZWNlaXZlcjogJ251bGwnCiAgcm91dGVzOgogIC0gbWF0Y2g6CiAgICAgIGFsZXJ0bmFtZTogRGVhZE1hbnNTd2l0Y2gKICAgIHJlY2VpdmVyOiAnbnVsbCcKcmVjZWl2ZXJzOgotIG5hbWU6ICdudWxsJwo=
kind: Secret
metadata:
name: alertmanager-main
namespace: monitoring
type: Opaque
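For reference, the base64 alertmanager.yaml payload in this new Secret decodes to roughly the following default Alertmanager configuration: a single 'null' receiver plus a dedicated route for the DeadMansSwitch alert:

global:
  resolve_timeout: 5m
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
  - match:
      alertname: DeadMansSwitch
    receiver: 'null'
receivers:
- name: 'null'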


@@ -16,7 +16,7 @@ spec:
app: grafana
spec:
containers:
- image: quay.io/coreos/monitoring-grafana:5.0.3
- image: grafana/grafana:5.1.0
name: grafana
ports:
- containerPort: 3000
@@ -29,13 +29,13 @@ spec:
cpu: 100m
memory: 100Mi
volumeMounts:
- mountPath: /data
- mountPath: /var/lib/grafana
name: grafana-storage
readOnly: false
- mountPath: /grafana/conf/provisioning/datasources
- mountPath: /etc/grafana/provisioning/datasources
name: grafana-datasources
readOnly: false
- mountPath: /grafana/conf/provisioning/dashboards
- mountPath: /etc/grafana/provisioning/dashboards
name: grafana-dashboards
readOnly: false
- mountPath: /grafana-dashboard-definitions/0
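The new mount points follow Grafana 5's standard provisioning layout. A datasource file dropped into /etc/grafana/provisioning/datasources takes roughly this shape; the name and URL below are illustrative and are not the contents of the grafana-datasources volume in this repository:

apiVersion: 1
datasources:
- name: prometheus
  type: prometheus
  access: proxy
  url: http://prometheus-k8s.monitoring.svc:9090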


@@ -89,6 +89,8 @@ spec:
requests:
cpu: 10m
memory: 30Mi
nodeSelector:
beta.kubernetes.io/os: linux
securityContext:
runAsNonRoot: true
runAsUser: 65534


@@ -6,6 +6,7 @@ metadata:
name: kube-state-metrics
namespace: monitoring
spec:
clusterIP: None
ports:
- name: https-main
port: 8443


@@ -50,10 +50,15 @@ spec:
requests:
cpu: 10m
memory: 20Mi
nodeSelector:
beta.kubernetes.io/os: linux
securityContext:
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: node-exporter
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
volumes:
- hostPath:
path: /proc


@@ -6,6 +6,7 @@ metadata:
name: node-exporter
namespace: monitoring
spec:
clusterIP: None
ports:
- name: https
port: 9100


@@ -1,590 +0,0 @@
apiVersion: v1
data:
alertmanager.rules.yaml: |
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
"alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configurations of the instances of the Alertmanager cluster
`{{$labels.service}}` are out of sync.
summary: Configuration out of sync
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
summary: Alertmanager down or missing
- alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
summary: Alertmanager's configuration reload failed
etcd3.rules.yaml: |
groups:
- name: ./etcd3.rules
rules:
- alert: InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down, the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
}} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with
{{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations
general.rules.yaml: |
groups:
- name: general.rules
rules:
- alert: TargetDown
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of {{ $labels.job }} targets are down.'
summary: Targets are down
- alert: DeadMansSwitch
expr: vector(1)
labels:
severity: none
annotations:
description: This is a DeadMansSwitch meant to ensure that the entire Alerting
pipeline is functional.
summary: Alerting DeadMansSwitch
- record: fd_utilization
expr: process_open_fds / process_max_fds
- alert: FdExhaustionClose
expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
for: 10m
labels:
severity: warning
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
will exhaust its available file/socket descriptors within the next 4 hours'
summary: file descriptors soon exhausted
- alert: FdExhaustionClose
expr: predict_linear(fd_utilization[10m], 3600) > 1
for: 10m
labels:
severity: critical
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
will exhaust its available file/socket descriptors within the next hour'
summary: file descriptors soon exhausted
kube-controller-manager.rules.yaml: |
groups:
- name: kube-controller-manager.rules
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="kube-controller-manager"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication
controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
kube-scheduler.rules.yaml: |
groups:
- name: kube-scheduler.rules
rules:
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- alert: K8SSchedulerDown
expr: absent(up{job="kube-scheduler"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S scheduler. New pods are not being assigned
to nodes.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
summary: Scheduler is down
kube-state-metrics.rules.yaml: |
groups:
- name: kube-state-metrics.rules
rules:
- alert: DeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 15m
labels:
severity: warning
annotations:
description: Observed deployment generation does not match expected one for
deployment {{$labels.namespace}}/{{$labels.deployment}}
summary: Deployment is outdated
- alert: DeploymentReplicasNotUpdated
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
unless (kube_deployment_spec_paused == 1)
for: 15m
labels:
severity: warning
annotations:
description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
summary: Deployment replicas are outdated
- alert: DaemonSetRolloutStuck
expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
* 100 < 100
for: 15m
labels:
severity: warning
annotations:
description: Only {{$value}}% of desired pods scheduled and ready for daemon
set {{$labels.namespace}}/{{$labels.daemonset}}
summary: DaemonSet is missing pods
- alert: K8SDaemonSetsNotScheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
> 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are not scheduled.
summary: Daemonsets are not scheduled correctly
- alert: DaemonSetsMissScheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are running where they are not supposed
to run.
summary: Daemonsets are not scheduled correctly
- alert: PodFrequentlyRestarting
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
for: 10m
labels:
severity: warning
annotations:
description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
times within the last hour
summary: Pod is restarting frequently
kubelet.rules.yaml: |
groups:
- name: kubelet.rules
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API,
or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
> 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }}% of Kubernetes nodes are not ready'
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Prometheus failed to scrape
- alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
* 100 > 10
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
for: 10m
labels:
severity: warning
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
to the limit of 110
summary: Kubelet is close to pod limit
kubernetes.rules.yaml: |
groups:
- name: kubernetes.rules
rules:
- record: pod_name:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(pod_name)
- record: pod_name:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: pod_name:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
BY (pod_name)
- record: pod_name:container_fs_usage_bytes:sum
expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: namespace:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
- record: namespace:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
- record: namespace:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
BY (namespace)
- record: cluster:memory_usage:ratio
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:container_spec_cpu_shares:ratio
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
/ sum(machine_cpu_cores)
- record: cluster:container_cpu_usage:ratio
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
/ sum(machine_cpu_cores)
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.99"
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.9"
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.5"
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 1
for: 10m
labels:
severity: warning
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
summary: API server high latency
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
for: 10m
labels:
severity: critical
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
summary: API server high latency
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 2
for: 10m
labels:
severity: warning
annotations:
description: API server returns errors for {{ $value }}% of requests
summary: API server request errors
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 5
for: 10m
labels:
severity: critical
annotations:
description: API server returns errors for {{ $value }}% of requests
- alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 20m
labels:
severity: critical
annotations:
description: No API servers are reachable or all have disappeared from service
discovery
summary: No API servers are reachable
- alert: K8sCertificateExpirationNotice
labels:
severity: warning
annotations:
description: Kubernetes API Certificate is expiring soon (less than 7 days)
summary: Kubernetes API Certificate is expiring soon
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
- alert: K8sCertificateExpirationNotice
labels:
severity: critical
annotations:
description: Kubernetes API Certificate is expiring in less than 1 day
summary: Kubernetes API Certificate is expiring
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
node.rules.yaml: |
groups:
- name: node.rules
rules:
- record: instance:node_cpu:rate:sum
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m]))
BY (instance)
- record: instance:node_filesystem_usage:sum
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
BY (instance)
- record: instance:node_network_receive_bytes:rate:sum
expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
- record: instance:node_network_transmit_bytes:rate:sum
expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
- record: instance:node_cpu:ratio
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance)
GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
- record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
- record: cluster:node_cpu:ratio
expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
- alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
labels:
severity: warning
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
summary: Prometheus could not scrape a node-exporter
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
labels:
severity: warning
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 24 hours (mounted at {{$labels.mountpoint}})
summary: Node disk is running full within 24 hours
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
for: 10m
labels:
severity: critical
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 2 hours (mounted at {{$labels.mountpoint}})
summary: Node disk is running full within 2 hours
prometheus.rules.yaml: "groups:\n- name: prometheus.rules\n rules:\n - alert:
PrometheusConfigReloadFailed\n expr: prometheus_config_last_reload_successful
== 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description:
Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}\n
\ summary: Reloading Prometheus' configuration failed\n\n - alert: PrometheusNotificationQueueRunningFull\n
\ expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) >
prometheus_notifications_queue_capacity\n for: 10m\n labels:\n severity:
warning\n annotations:\n description: Prometheus' alert notification queue
is running full for {{$labels.namespace}}/{{\n $labels.pod}}\n summary:
Prometheus' alert notification queue is running full \n\n - alert: PrometheusErrorSendingAlerts\n
\ expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])\n
\ > 0.01\n for: 10m\n labels:\n severity: warning\n annotations:\n
\ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{\n
\ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\n summary:
Errors while sending alert from Prometheus\n\n - alert: PrometheusErrorSendingAlerts\n
\ expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])\n
\ > 0.03\n for: 10m\n labels:\n severity: critical\n annotations:\n
\ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{\n
\ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\n summary:
Errors while sending alerts from Prometheus\n\n - alert: PrometheusNotConnectedToAlertmanagers\n
\ expr: prometheus_notifications_alertmanagers_discovered < 1\n for: 10m\n
\ labels:\n severity: warning\n annotations:\n description: Prometheus
{{ $labels.namespace }}/{{ $labels.pod}} is not connected\n to any Alertmanagers\n
\ summary: Prometheus is not connected to any Alertmanagers\n\n - alert:
PrometheusTSDBReloadsFailing\n expr: increase(prometheus_tsdb_reloads_failures_total[2h])
> 0\n for: 12h\n labels:\n severity: warning\n annotations:\n description:
'{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}\n reload
failures over the last four hours.'\n summary: Prometheus has issues reloading
data blocks from disk\n\n - alert: PrometheusTSDBCompactionsFailing\n expr:
increase(prometheus_tsdb_compactions_failed_total[2h]) > 0\n for: 12h\n labels:\n
\ severity: warning\n annotations:\n description: '{{$labels.job}}
at {{$labels.instance}} had {{$value | humanize}}\n compaction failures
over the last four hours.'\n summary: Prometheus has issues compacting sample
blocks\n\n - alert: PrometheusTSDBWALCorruptions\n expr: tsdb_wal_corruptions_total
> 0\n for: 4h\n labels:\n severity: warning\n annotations:\n description:
'{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead\n log
(WAL).'\n summary: Prometheus write-ahead log is corrupted\n\n - alert:
PrometheusNotIngestingSamples\n expr: rate(prometheus_tsdb_head_samples_appended_total[5m])
<= 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description:
\"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.\"\n
\ summary: \"Prometheus isn't ingesting samples\"\n\n - alert: PrometheusTargetScapesDuplicate\n
\ expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m])
> 0\n for: 10m\n labels:\n severity: warning\n annotations:\n description:
\"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate
timestamps but different values\"\n summary: Prometheus has many samples
rejected\n"
kind: ConfigMap
metadata:
labels:
prometheus: k8s
role: alert-rules
name: prometheus-k8s-rules
namespace: monitoring


@@ -11,6 +11,9 @@ spec:
- name: alertmanager-main
namespace: monitoring
port: web
baseImage: quay.io/prometheus/prometheus
nodeSelector:
beta.kubernetes.io/os: linux
replicas: 2
resources:
requests:


@@ -0,0 +1,166 @@
apiVersion: v1
data:
all.rules.yaml: "\"groups\": \n- \"name\": \"k8s.rules\"\n \"rules\": \n - \"expr\":
|\n sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\"}[5m]))
by (namespace)\n \"record\": \"namespace:container_cpu_usage_seconds_total:sum_rate\"\n
\ - \"expr\": |\n sum(container_memory_usage_bytes{job=\"kubelet\", image!=\"\"})
by (namespace)\n \"record\": \"namespace:container_memory_usage_bytes:sum\"\n
\ - \"expr\": |\n sum by (namespace, label_name) (\n sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\",
image!=\"\"}[5m])) by (namespace, pod_name)\n * on (namespace, pod_name)
group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"},
\"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:container_cpu_usage_seconds_total:sum_rate\"\n
\ - \"expr\": |\n sum by (namespace, label_name) (\n sum(container_memory_usage_bytes{job=\"kubelet\",image!=\"\"})
by (pod_name, namespace)\n * on (namespace, pod_name) group_left(label_name)\n
\ label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, \"pod_name\",
\"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:container_memory_usage_bytes:sum\"\n
\ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"})
by (namespace, pod)\n * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"},
\"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_memory_bytes:sum\"\n
\ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"})
by (namespace, pod)\n * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"},
\"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_cpu_cores:sum\"\n-
\"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info)
by (node))\"\n \"record\": \":kube_pod_info_node_count:\"\n - \"expr\": |\n
\ max(label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\",
\"pod\", \"(.*)\")) by (node, namespace, pod)\n \"record\": \"node_namespace_pod:kube_pod_info:\"\n
\ - \"expr\": |\n count by (node) (sum by (node, cpu) (\n node_cpu{job=\"node-exporter\"}\n
\ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ ))\n \"record\": \"node:node_num_cpu:sum\"\n - \"expr\": |\n 1
- avg(rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m]))\n \"record\":
\":node_cpu_utilisation:avg1m\"\n - \"expr\": |\n 1 - avg by (node) (\n
\ rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m])\n * on (namespace,
pod) group_left(node)\n node_namespace_pod:kube_pod_info:)\n \"record\":
\"node:node_cpu_utilisation:avg1m\"\n - \"expr\": |\n sum(node_load1{job=\"node-exporter\"})\n
\ /\n sum(node:node_num_cpu:sum)\n \"record\": \":node_cpu_saturation_load1:\"\n
\ - \"expr\": |\n sum by (node) (\n node_load1{job=\"node-exporter\"}\n
\ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ )\n /\n node:node_num_cpu:sum\n \"record\": \"node:node_cpu_saturation_load1:\"\n
\ - \"expr\": |\n 1 -\n sum(node_memory_MemFree{job=\"node-exporter\"}
+ node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n
\ /\n sum(node_memory_MemTotal{job=\"node-exporter\"})\n \"record\":
\":node_memory_utilisation:\"\n - \"expr\": |\n sum by (node) (\n (node_memory_MemFree{job=\"node-exporter\"}
+ node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n
\ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ )\n \"record\": \"node:node_memory_bytes_available:sum\"\n - \"expr\":
|\n sum by (node) (\n node_memory_MemTotal{job=\"node-exporter\"}\n
\ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ )\n \"record\": \"node:node_memory_bytes_total:sum\"\n - \"expr\": |\n
\ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)\n
\ /\n scalar(sum(node:node_memory_bytes_total:sum))\n \"record\":
\"node:node_memory_utilisation:ratio\"\n - \"expr\": |\n 1e3 * sum(\n (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n
\ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n )\n \"record\":
\":node_memory_swap_io_bytes:sum_rate\"\n - \"expr\": |\n 1 -\n sum
by (node) (\n (node_memory_MemFree{job=\"node-exporter\"} + node_memory_Cached{job=\"node-exporter\"}
+ node_memory_Buffers{job=\"node-exporter\"})\n * on (namespace, pod) group_left(node)\n
\ node_namespace_pod:kube_pod_info:\n )\n /\n sum by (node)
(\n node_memory_MemTotal{job=\"node-exporter\"}\n * on (namespace,
pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n )\n \"record\":
\"node:node_memory_utilisation:\"\n - \"expr\": |\n 1 - (node:node_memory_bytes_available:sum
/ node:node_memory_bytes_total:sum)\n \"record\": \"node:node_memory_utilisation_2:\"\n
\ - \"expr\": |\n 1e3 * sum by (node) (\n (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n
\ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n * on (namespace,
pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n )\n \"record\":
\"node:node_memory_swap_io_bytes:sum_rate\"\n - \"expr\": |\n avg(irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m])
/ 1e3)\n \"record\": \":node_disk_utilisation:avg_irate\"\n - \"expr\": |\n
\ avg by (node) (\n irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m])
/ 1e3\n * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ )\n \"record\": \"node:node_disk_utilisation:avg_irate\"\n - \"expr\":
|\n avg(irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m])
/ 1e3)\n \"record\": \":node_disk_saturation:avg_irate\"\n - \"expr\": |\n
\ avg by (node) (\n irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m])
/ 1e3\n * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ )\n \"record\": \"node:node_disk_saturation:avg_irate\"\n - \"expr\":
|\n sum(irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))
+\n sum(irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
\ \"record\": \":node_net_utilisation:sum_irate\"\n - \"expr\": |\n sum
by (node) (\n (irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m])
+\n irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
\ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ )\n \"record\": \"node:node_net_utilisation:sum_irate\"\n - \"expr\":
|\n sum(irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))
+\n sum(irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
\ \"record\": \":node_net_saturation:sum_irate\"\n - \"expr\": |\n sum
by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m])
+\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
\ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-apps\"\n
\ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n
\ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf \\\"%.2f\\\" $value }} / second\"\n \"expr\": |\n
\ rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[15m])
> 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"critical\"\n
\ - \"alert\": \"KubePodNotReady\"\n \"annotations\": \n \"message\":
\"{{ $labels.namespace }}/{{ $labels.pod }} is not ready.\"\n \"expr\": |\n
\ sum by (namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\",
phase!~\"Running|Succeeded\"}) > 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\":
\"critical\"\n - \"alert\": \"KubeDeploymentGenerationMismatch\"\n \"annotations\":
\n \"message\": \"Deployment {{ $labels.namespace }}/{{ labels.deployment
}} generation mismatch\"\n \"expr\": |\n kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n
\ !=\n kube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n
\ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n -
\"alert\": \"KubeDeploymentReplicasMismatch\"\n \"annotations\": \n \"message\":
\"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch\"\n
\ \"expr\": |\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n
\ !=\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n
\ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n- \"name\":
\"kubernetes-resources\"\n \"rules\": \n - \"alert\": \"KubeCPUOvercommit\"\n
\ \"annotations\": \n \"message\": \"Overcommited CPU resource requests
on Pods, cannot tolerate node failure.\"\n \"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)\n
\ /\n sum(node:node_num_cpu:sum)\n >\n (count(node:node_num_cpu:sum)-1)
/ count(node:node_num_cpu:sum)\n \"for\": \"5m\"\n \"labels\": \n \"severity\":
\"warning\"\n - \"alert\": \"KubeMemOvercommit\"\n \"annotations\": \n \"message\":
\"Overcommited Memory resource requests on Pods, cannot tolerate node failure.\"\n
\ \"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)\n
\ /\n sum(node_memory_MemTotal)\n >\n (count(node:node_num_cpu:sum)-1)\n
\ /\n count(node:node_num_cpu:sum)\n \"for\": \"5m\"\n \"labels\":
\n \"severity\": \"warning\"\n - \"alert\": \"KubeCPUOvercommit\"\n \"annotations\":
\n \"message\": \"Overcommited CPU resource request quota on Namespaces.\"\n
\ \"expr\": |\n sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\",
resource=\"requests.cpu\"})\n /\n sum(node:node_num_cpu:sum)\n >
1.5\n \"for\": \"5m\"\n \"labels\": \n \"severity\": \"warning\"\n
\ - \"alert\": \"KubeMemOvercommit\"\n \"annotations\": \n \"message\":
\"Overcommited Memory resource request quota on Namespaces.\"\n \"expr\": |\n
\ sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"requests.memory\"})\n
\ /\n sum(node_memory_MemTotal{job=\"node-exporter\"})\n > 1.5\n
\ \"for\": \"5m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\":
\"KubeQuotaExceeded\"\n \"annotations\": \n \"message\": \"{{ printf \\\"%0.0f\\\"
$value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}.\"\n
\ \"expr\": |\n 100 * kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n
\ / ignoring(instance, job, type)\n kube_resourcequota{job=\"kube-state-metrics\",
type=\"hard\"}\n > 90\n \"for\": \"15m\"\n \"labels\": \n \"severity\":
\"warning\"\n- \"name\": \"kubernetes-storage\"\n \"rules\": \n - \"alert\":
\"KubePersistentVolumeUsageCritical\"\n \"annotations\": \n \"message\":
\"The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace
{{ $labels.namespace }} has {{ printf \\\"%0.0f\\\" $value }}% free.\"\n \"expr\":
|\n 100 * kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\n
\ kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n < 3\n \"for\":
\"1m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubePersistentVolumeFullInFourDays\"\n
\ \"annotations\": \n \"message\": \"Based on recent sampling, the persistent
volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace
}} is expected to fill up within four days.\"\n \"expr\": |\n predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[1h],
4 * 24 * 3600) < 0\n \"for\": \"5m\"\n \"labels\": \n \"severity\":
\"critical\"\n- \"name\": \"kubernetes-system\"\n \"rules\": \n - \"alert\":
\"KubeNodeNotReady\"\n \"annotations\": \n \"message\": \"{{ $labels.node
}} has been unready for more than an hour\"\n \"expr\": |\n max(kube_node_status_ready{job=\"kube-state-metrics\",
condition=\"false\"} == 1) BY (node)\n \"for\": \"1h\"\n \"labels\": \n
\ \"severity\": \"warning\"\n - \"alert\": \"KubeVersionMismatch\"\n \"annotations\":
\n \"message\": \"There are {{ $value }} different versions of Kubernetes
components running.\"\n \"expr\": |\n count(count(kubernetes_build_info{job!=\"kube-dns\"})
by (gitVersion)) > 1\n \"for\": \"1h\"\n \"labels\": \n \"severity\":
\"warning\"\n - \"alert\": \"KubeClientErrors\"\n \"annotations\": \n \"message\":
\"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing
{{ printf \\\"%0.0f\\\" $value }}% errors.'\"\n \"expr\": |\n sum(rate(rest_client_requests_total{code!~\"2..\"}[5m]))
by (instance, job) * 100\n /\n sum(rate(rest_client_requests_total[5m]))
by (instance, job)\n > 1\n \"for\": \"15m\"\n \"labels\": \n \"severity\":
\"warning\"\n - \"alert\": \"KubeClientErrors\"\n \"annotations\": \n \"message\":
\"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing
{{ printf \\\"%0.0f\\\" $value }} errors / sec.'\"\n \"expr\": |\n sum(rate(ksm_scrape_error_total{job=\"kube-state-metrics\"}[5m]))
by (instance, job) > 0.1\n \"for\": \"15m\"\n \"labels\": \n \"severity\":
\"warning\""
kind: ConfigMap
metadata:
labels:
prometheus: k8s
role: alert-rules
name: prometheus-k8s-rules
namespace: monitoring
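Unescaped, the beginning of the all.rules.yaml string above corresponds to recording rules along these lines (excerpt of the k8s.rules group only):

groups:
- name: k8s.rules
  rules:
  - record: namespace:container_cpu_usage_seconds_total:sum_rate
    expr: |
      sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace)
  - record: namespace:container_memory_usage_bytes:sum
    expr: |
      sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace)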