kube-prometheus/jsonnet: Use jsonnet-bundler
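The switch to jsonnet-bundler means the manifests below are generated from jsonnet sources with vendored dependencies rather than maintained by hand. As a rough sketch of that workflow (the actual jsonnetfile.json and vendored sources are in the suppressed diffs, so the dependency name, remote, subdir, and version below are illustrative assumptions, not the committed values), jsonnet-bundler pins dependencies in a jsonnetfile.json:

    {
      "dependencies": [
        {
          "name": "kube-prometheus",
          "source": {
            "git": {
              "remote": "https://github.com/coreos/prometheus-operator",
              "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
            }
          },
          "version": "master"
        }
      ]
    }

Running `jb install` then fetches the pinned sources into vendor/, from which the manifests/ files below are regenerated.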
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,236 @@
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  creationTimestamp: null
  name: servicemonitors.monitoring.coreos.com
spec:
  group: monitoring.coreos.com
  names:
    kind: ServiceMonitor
    plural: servicemonitors
  scope: Namespaced
  validation:
    openAPIV3Schema:
      description: ServiceMonitor defines monitoring for a set of services.
      properties:
        apiVersion:
          description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources'
          type: string
        kind:
          description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds'
          type: string
        spec:
          description: ServiceMonitorSpec contains specification parameters for a ServiceMonitor.
          properties:
            endpoints:
              description: A list of endpoints allowed as part of this ServiceMonitor.
              items:
                description: Endpoint defines a scrapeable endpoint serving Prometheus metrics.
                properties:
                  basicAuth:
                    description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints'
                    properties:
                      password:
                        description: SecretKeySelector selects a key of a Secret.
                        properties:
                          key:
                            description: The key of the secret to select from. Must be a valid secret key.
                            type: string
                          name:
                            description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
                            type: string
                          optional:
                            description: Specify whether the Secret or its key must be defined
                            type: boolean
                        required:
                        - key
                      username:
                        description: SecretKeySelector selects a key of a Secret.
                        properties:
                          key:
                            description: The key of the secret to select from. Must be a valid secret key.
                            type: string
                          name:
                            description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
                            type: string
                          optional:
                            description: Specify whether the Secret or its key must be defined
                            type: boolean
                        required:
                        - key
                  bearerTokenFile:
                    description: File to read bearer token for scraping targets.
                    type: string
                  honorLabels:
                    description: HonorLabels chooses the metric's labels on collisions with target labels.
                    type: boolean
                  interval:
                    description: Interval at which metrics should be scraped
                    type: string
                  metricRelabelings:
                    description: MetricRelabelConfigs to apply to samples before ingestion.
                    items:
                      description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `<metric_relabel_configs>`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs'
                      properties:
                        action:
                          description: Action to perform based on regex matching. Default is 'replace'
                          type: string
                        modulus:
                          description: Modulus to take of the hash of the source label values.
                          format: int64
                          type: integer
                        regex:
                          description: Regular expression against which the extracted value is matched. Default is '(.*)'
                          type: string
                        replacement:
                          description: Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1'
                          type: string
                        separator:
                          description: Separator placed between concatenated source label values. Default is ';'.
                          type: string
                        sourceLabels:
                          description: The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions.
                          items:
                            type: string
                          type: array
                        targetLabel:
                          description: Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available.
                          type: string
                    type: array
                  params:
                    description: Optional HTTP URL parameters
                    type: object
                  path:
                    description: HTTP path to scrape for metrics.
                    type: string
                  port:
                    description: Name of the service port this endpoint refers to. Mutually exclusive with targetPort.
                    type: string
                  scheme:
                    description: HTTP scheme to use for scraping.
                    type: string
                  scrapeTimeout:
                    description: Timeout after which the scrape is ended
                    type: string
                  targetPort: {}
                  tlsConfig:
                    description: TLSConfig specifies TLS configuration parameters.
                    properties:
                      caFile:
                        description: The CA cert to use for the targets.
                        type: string
                      certFile:
                        description: The client cert file for the targets.
                        type: string
                      insecureSkipVerify:
                        description: Disable target certificate validation.
                        type: boolean
                      keyFile:
                        description: The client key file for the targets.
                        type: string
                      serverName:
                        description: Used to verify the hostname for the targets.
                        type: string
              type: array
            jobLabel:
              description: The label to use to retrieve the job name from.
              type: string
            namespaceSelector:
              description: A selector for selecting namespaces either selecting all namespaces or a list of namespaces.
              properties:
                any:
                  description: Boolean describing whether all namespaces are selected in contrast to a list restricting them.
                  type: boolean
                matchNames:
                  description: List of namespace names.
                  items:
                    type: string
                  type: array
            selector:
              description: A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.
              properties:
                matchExpressions:
                  description: matchExpressions is a list of label selector requirements. The requirements are ANDed.
                  items:
                    description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values.
                    properties:
                      key:
                        description: key is the label key that the selector applies to.
                        type: string
                      operator:
                        description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.
                        type: string
                      values:
                        description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.
                        items:
                          type: string
                        type: array
                    required:
                    - key
                    - operator
                  type: array
                matchLabels:
                  description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed.
                  type: object
            targetLabels:
              description: TargetLabels transfers labels on the Kubernetes Service onto the target.
              items:
                type: string
              type: array
          required:
          - endpoints
          - selector
      required:
      - spec
  version: v1
status:
  acceptedNames:
    kind: ""
    plural: ""
  conditions: null
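For reference, a minimal ServiceMonitor accepted by this schema looks like the sketch below; the required fields per the CRD are spec, spec.endpoints, and spec.selector, and every name and label value here is hypothetical:

    apiVersion: monitoring.coreos.com/v1
    kind: ServiceMonitor
    metadata:
      name: example-app            # hypothetical name
      namespace: monitoring
    spec:
      jobLabel: k8s-app
      selector:
        matchLabels:
          k8s-app: example-app     # hypothetical label on the target Service
      namespaceSelector:
        matchNames:
        - default
      endpoints:
      - port: web                  # must name a port on the selected Service
        interval: 30s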
@@ -31,6 +31,8 @@ spec:
           requests:
             cpu: 100m
             memory: 50Mi
+      nodeSelector:
+        beta.kubernetes.io/os: linux
       securityContext:
         runAsNonRoot: true
         runAsUser: 65534
@@ -1,9 +1,12 @@
 apiVersion: v1
 kind: Service
 metadata:
+  labels:
+    k8s-app: prometheus-operator
   name: prometheus-operator
   namespace: monitoring
 spec:
+  clusterIP: None
   ports:
   - name: http
     port: 8080
@@ -6,6 +6,9 @@ metadata:
  name: main
  namespace: monitoring
spec:
  baseImage: quay.io/prometheus/alertmanager
  nodeSelector:
    beta.kubernetes.io/os: linux
  replicas: 3
  serviceAccountName: alertmanager-main
  version: v0.14.0
@@ -1,8 +0,0 @@
apiVersion: v1
data:
  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
kind: Secret
metadata:
  name: alertmanager-main
  namespace: monitoring
type: Opaque
manifests/alertmanager-secret.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
apiVersion: v1
data:
  alertmanager.yaml: Cmdsb2JhbDoKICByZXNvbHZlX3RpbWVvdXQ6IDVtCnJvdXRlOgogIGdyb3VwX2J5OiBbJ2pvYiddCiAgZ3JvdXBfd2FpdDogMzBzCiAgZ3JvdXBfaW50ZXJ2YWw6IDVtCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByZWNlaXZlcjogJ251bGwnCiAgcm91dGVzOgogIC0gbWF0Y2g6CiAgICAgIGFsZXJ0bmFtZTogRGVhZE1hbnNTd2l0Y2gKICAgIHJlY2VpdmVyOiAnbnVsbCcKcmVjZWl2ZXJzOgotIG5hbWU6ICdudWxsJwo=
kind: Secret
metadata:
  name: alertmanager-main
  namespace: monitoring
type: Opaque
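Decoded, the base64 alertmanager.yaml payload carries the same minimal configuration in both the removed and the added Secret (the new payload only gains a leading newline):

    global:
      resolve_timeout: 5m
    route:
      group_by: ['job']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: 'null'
      routes:
      - match:
          alertname: DeadMansSwitch
        receiver: 'null'
    receivers:
    - name: 'null'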
File diff suppressed because it is too large
@@ -16,7 +16,7 @@ spec:
         app: grafana
     spec:
       containers:
-      - image: quay.io/coreos/monitoring-grafana:5.0.3
+      - image: grafana/grafana:5.1.0
         name: grafana
         ports:
         - containerPort: 3000
@@ -29,13 +29,13 @@ spec:
             cpu: 100m
             memory: 100Mi
         volumeMounts:
-        - mountPath: /data
+        - mountPath: /var/lib/grafana
           name: grafana-storage
           readOnly: false
-        - mountPath: /grafana/conf/provisioning/datasources
+        - mountPath: /etc/grafana/provisioning/datasources
           name: grafana-datasources
           readOnly: false
-        - mountPath: /grafana/conf/provisioning/dashboards
+        - mountPath: /etc/grafana/provisioning/dashboards
           name: grafana-dashboards
           readOnly: false
         - mountPath: /grafana-dashboard-definitions/0
@@ -89,6 +89,8 @@ spec:
           requests:
             cpu: 10m
             memory: 30Mi
+      nodeSelector:
+        beta.kubernetes.io/os: linux
       securityContext:
         runAsNonRoot: true
         runAsUser: 65534
@@ -6,6 +6,7 @@ metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  clusterIP: None
  ports:
  - name: https-main
    port: 8443
@@ -50,10 +50,15 @@ spec:
          requests:
            cpu: 10m
            memory: 20Mi
      nodeSelector:
        beta.kubernetes.io/os: linux
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: node-exporter
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      volumes:
      - hostPath:
          path: /proc
@@ -6,6 +6,7 @@ metadata:
  name: node-exporter
  namespace: monitoring
spec:
  clusterIP: None
  ports:
  - name: https
    port: 9100
@@ -1,590 +0,0 @@
apiVersion: v1
data:
  alertmanager.rules.yaml: |
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
          "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configuration of the instances of the Alertmanager cluster
            `{{$labels.service}}` are out of sync.
          summary: Configuration out of sync
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers are scraped or Alertmanagers
            disappeared from discovery.
          summary: Alertmanager down or missing
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
            }}/{{ $labels.pod}}.
          summary: Alertmanager's configuration reload failed
  etcd3.rules.yaml: |
    groups:
    - name: ./etcd3.rules
      rules:
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down the cluster will be unavailable
          summary: etcd cluster insufficient members
      - alert: NoLeader
        expr: etcd_server_has_leader{job="etcd"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: etcd member {{ $labels.instance }} has no leader
          summary: etcd member has no leader
      - alert: HighNumberOfLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
            changes within the last hour
          summary: a high number of leader changes within the etcd cluster are happening
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: GRPCRequestsSlow
        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
          > 0.15
        for: 10m
        labels:
          severity: critical
        annotations:
          description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
            }} are slow
          summary: slow gRPC requests
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HTTPRequestsSlow
        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
            }} are slow
          summary: slow HTTP requests
      - alert: EtcdMemberCommunicationSlow
        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} member communication with
            {{ $labels.To }} is slow
          summary: etcd member communication is slow
      - alert: HighNumberOfFailedProposals
        expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
            failures within the last hour
          summary: a high number of proposals within the etcd cluster are failing
      - alert: HighFsyncDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
          > 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} fsync durations are high
          summary: high fsync durations
      - alert: HighCommitDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
          > 0.25
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} commit durations are high
          summary: high commit durations
  general.rules.yaml: |
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
          summary: Targets are down
      - alert: DeadMansSwitch
        expr: vector(1)
        labels:
          severity: none
        annotations:
          description: This is a DeadMansSwitch meant to ensure that the entire Alerting
            pipeline is functional.
          summary: Alerting DeadMansSwitch
      - record: fd_utilization
        expr: process_open_fds / process_max_fds
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust in file/socket descriptors within the next 4 hours'
          summary: file descriptors soon exhausted
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust in file/socket descriptors within the next hour'
          summary: file descriptors soon exhausted
  kube-controller-manager.rules.yaml: |
    groups:
    - name: kube-controller-manager.rules
      rules:
      - alert: K8SControllerManagerDown
        expr: absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S controller manager. Deployments and replication
            controllers are not making progress.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
          summary: Controller manager is down
  kube-scheduler.rules.yaml: |
    groups:
    - name: kube-scheduler.rules
      rules:
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - alert: K8SSchedulerDown
        expr: absent(up{job="kube-scheduler"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S scheduler. New pods are not being assigned
            to nodes.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
          summary: Scheduler is down
  kube-state-metrics.rules.yaml: |
    groups:
    - name: kube-state-metrics.rules
      rules:
      - alert: DeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Observed deployment generation does not match expected one for
            deployment {{$labels.namespace}}/{{$labels.deployment}}
          summary: Deployment is outdated
      - alert: DeploymentReplicasNotUpdated
        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
          unless (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
          summary: Deployment replicas are outdated
      - alert: DaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
          * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Only {{$value}}% of desired pods scheduled and ready for daemon
            set {{$labels.namespace}}/{{$labels.daemonset}}
          summary: DaemonSet is missing pods
      - alert: K8SDaemonSetsNotScheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are not scheduled.
          summary: Daemonsets are not scheduled correctly
      - alert: DaemonSetsMissScheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are running where they are not supposed
            to run.
          summary: Daemonsets are not scheduled correctly
      - alert: PodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
            times within the last hour
          summary: Pod is restarting frequently
  kubelet.rules.yaml: |
    groups:
    - name: kubelet.rules
      rules:
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: The Kubelet on {{ $labels.node }} has not checked in with the API,
            or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady
      - alert: K8SManyNodesNotReady
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
          0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of Kubernetes nodes are not ready'
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
          summary: Prometheus failed to scrape
      - alert: K8SKubeletDown
        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 10
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
            have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
            to the limit of 110
          summary: Kubelet is close to pod limit
  kubernetes.rules.yaml: |
    groups:
    - name: kubernetes.rules
      rules:
      - record: pod_name:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
          / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency:quantile_seconds
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
          summary: API server request errors
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service
            discovery
          summary: No API servers are reachable

      - alert: K8sCertificateExpirationNotice
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
          summary: Kubernetes API Certificate is expiring soon
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0

      - alert: K8sCertificateExpirationNotice
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
          summary: Kubernetes API Certificate is expiring
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
  node.rules.yaml: |
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m]))
          BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
          BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance)
          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m,
            or node-exporters have disappeared from discovery
          summary: Prometheus could not scrape a node-exporter
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 24 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 24 hours
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 2 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 2 hours
  prometheus.rules.yaml: "groups:\n- name: prometheus.rules\n  rules:\n  - alert:
    PrometheusConfigReloadFailed\n    expr: prometheus_config_last_reload_successful
    == 0\n    for: 10m\n    labels:\n      severity: warning\n    annotations:\n      description:
    Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}\n
    \ summary: Reloading Prometheus' configuration failed\n\n  - alert: PrometheusNotificationQueueRunningFull\n
    \ expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) >
    prometheus_notifications_queue_capacity\n    for: 10m\n    labels:\n      severity:
    warning\n    annotations:\n      description: Prometheus' alert notification queue
    is running full for {{$labels.namespace}}/{{\n        $labels.pod}}\n      summary:
    Prometheus' alert notification queue is running full \n\n  - alert: PrometheusErrorSendingAlerts\n
    \ expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])\n
    \ > 0.01\n    for: 10m\n    labels:\n      severity: warning\n    annotations:\n
    \ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{\n
    \ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\n      summary:
    Errors while sending alert from Prometheus\n\n  - alert: PrometheusErrorSendingAlerts\n
    \ expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])\n
    \ > 0.03\n    for: 10m\n    labels:\n      severity: critical\n    annotations:\n
    \ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{\n
    \ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\n      summary:
    Errors while sending alerts from Prometheus\n\n  - alert: PrometheusNotConnectedToAlertmanagers\n
    \ expr: prometheus_notifications_alertmanagers_discovered < 1\n    for: 10m\n
    \ labels:\n      severity: warning\n    annotations:\n      description: Prometheus
    {{ $labels.namespace }}/{{ $labels.pod}} is not connected\n        to any Alertmanagers\n
    \ summary: Prometheus is not connected to any Alertmanagers\n\n  - alert:
    PrometheusTSDBReloadsFailing\n    expr: increase(prometheus_tsdb_reloads_failures_total[2h])
    > 0\n    for: 12h\n    labels:\n      severity: warning\n    annotations:\n      description:
    '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}\n        reload
    failures over the last four hours.'\n      summary: Prometheus has issues reloading
    data blocks from disk\n\n  - alert: PrometheusTSDBCompactionsFailing\n    expr:
    increase(prometheus_tsdb_compactions_failed_total[2h]) > 0\n    for: 12h\n    labels:\n
    \ severity: warning\n    annotations:\n      description: '{{$labels.job}}
    at {{$labels.instance}} had {{$value | humanize}}\n        compaction failures
    over the last four hours.'\n      summary: Prometheus has issues compacting sample
    blocks\n\n  - alert: PrometheusTSDBWALCorruptions\n    expr: tsdb_wal_corruptions_total
    > 0\n    for: 4h\n    labels:\n      severity: warning\n    annotations:\n      description:
    '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead\n        log
    (WAL).'\n      summary: Prometheus write-ahead log is corrupted\n\n  - alert:
    PrometheusNotIngestingSamples\n    expr: rate(prometheus_tsdb_head_samples_appended_total[5m])
    <= 0\n    for: 10m\n    labels:\n      severity: warning\n    annotations:\n      description:
    \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.\"\n
    \ summary: \"Prometheus isn't ingesting samples\"\n\n  - alert: PrometheusTargetScapesDuplicate\n
    \ expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m])
    > 0\n    for: 10m\n    labels:\n      severity: warning\n    annotations:\n      description:
    \"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate
    \ timestamps but different values\"\n      summary: Prometheus has many samples
    rejected\n"
kind: ConfigMap
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
@@ -11,6 +11,9 @@ spec:
    - name: alertmanager-main
      namespace: monitoring
      port: web
  baseImage: quay.io/prometheus/prometheus
  nodeSelector:
    beta.kubernetes.io/os: linux
  replicas: 2
  resources:
    requests:
manifests/prometheus-rules.yaml (new file, 166 lines)
@@ -0,0 +1,166 @@
apiVersion: v1
data:
  all.rules.yaml: "\"groups\": \n- \"name\": \"k8s.rules\"\n  \"rules\": \n  - \"expr\":
    |\n      sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\"}[5m]))
    by (namespace)\n    \"record\": \"namespace:container_cpu_usage_seconds_total:sum_rate\"\n
    \ - \"expr\": |\n      sum(container_memory_usage_bytes{job=\"kubelet\", image!=\"\"})
    by (namespace)\n    \"record\": \"namespace:container_memory_usage_bytes:sum\"\n
    \ - \"expr\": |\n      sum by (namespace, label_name) (\n        sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\",
    image!=\"\"}[5m])) by (namespace, pod_name)\n      * on (namespace, pod_name)
    group_left(label_name)\n        label_replace(kube_pod_labels{job=\"kube-state-metrics\"},
    \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n      )\n    \"record\": \"namespace_name:container_cpu_usage_seconds_total:sum_rate\"\n
    \ - \"expr\": |\n      sum by (namespace, label_name) (\n        sum(container_memory_usage_bytes{job=\"kubelet\",image!=\"\"})
    by (pod_name, namespace)\n      * on (namespace, pod_name) group_left(label_name)\n
    \ label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, \"pod_name\",
    \"$1\", \"pod\", \"(.*)\")\n      )\n    \"record\": \"namespace_name:container_memory_usage_bytes:sum\"\n
    \ - \"expr\": |\n      sum by (namespace, label_name) (\n        sum(kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"})
    by (namespace, pod)\n      * on (namespace, pod) group_left(label_name)\n        label_replace(kube_pod_labels{job=\"kube-state-metrics\"},
    \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n      )\n    \"record\": \"namespace_name:kube_pod_container_resource_requests_memory_bytes:sum\"\n
    \ - \"expr\": |\n      sum by (namespace, label_name) (\n        sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"})
    by (namespace, pod)\n      * on (namespace, pod) group_left(label_name)\n        label_replace(kube_pod_labels{job=\"kube-state-metrics\"},
    \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n      )\n    \"record\": \"namespace_name:kube_pod_container_resource_requests_cpu_cores:sum\"\n-
    \"name\": \"node.rules\"\n  \"rules\": \n  - \"expr\": \"sum(min(kube_pod_info)
    by (node))\"\n    \"record\": \":kube_pod_info_node_count:\"\n  - \"expr\": |\n
    \ max(label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\",
    \"pod\", \"(.*)\")) by (node, namespace, pod)\n    \"record\": \"node_namespace_pod:kube_pod_info:\"\n
    \ - \"expr\": |\n      count by (node) (sum by (node, cpu) (\n        node_cpu{job=\"node-exporter\"}\n
    \ * on (namespace, pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n
    \ ))\n    \"record\": \"node:node_num_cpu:sum\"\n  - \"expr\": |\n      1
    - avg(rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m]))\n    \"record\":
    \":node_cpu_utilisation:avg1m\"\n  - \"expr\": |\n      1 - avg by (node) (\n
    \ rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m])\n      * on (namespace,
    pod) group_left(node)\n        node_namespace_pod:kube_pod_info:)\n    \"record\":
    \"node:node_cpu_utilisation:avg1m\"\n  - \"expr\": |\n      sum(node_load1{job=\"node-exporter\"})\n
    \ /\n      sum(node:node_num_cpu:sum)\n    \"record\": \":node_cpu_saturation_load1:\"\n
    \ - \"expr\": |\n      sum by (node) (\n        node_load1{job=\"node-exporter\"}\n
    \ * on (namespace, pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n
    \ )\n      /\n      node:node_num_cpu:sum\n    \"record\": \"node:node_cpu_saturation_load1:\"\n
    \ - \"expr\": |\n      1 -\n      sum(node_memory_MemFree{job=\"node-exporter\"}
    + node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n
    \ /\n      sum(node_memory_MemTotal{job=\"node-exporter\"})\n    \"record\":
    \":node_memory_utilisation:\"\n  - \"expr\": |\n      sum by (node) (\n        (node_memory_MemFree{job=\"node-exporter\"}
    + node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n
    \ * on (namespace, pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n
    \ )\n    \"record\": \"node:node_memory_bytes_available:sum\"\n  - \"expr\":
    |\n      sum by (node) (\n        node_memory_MemTotal{job=\"node-exporter\"}\n
    \ * on (namespace, pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n
    \ )\n    \"record\": \"node:node_memory_bytes_total:sum\"\n  - \"expr\": |\n
    \ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)\n
    \ /\n      scalar(sum(node:node_memory_bytes_total:sum))\n    \"record\":
    \"node:node_memory_utilisation:ratio\"\n  - \"expr\": |\n      1e3 * sum(\n        (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n
    \ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n      )\n    \"record\":
    \":node_memory_swap_io_bytes:sum_rate\"\n  - \"expr\": |\n      1 -\n      sum
    by (node) (\n        (node_memory_MemFree{job=\"node-exporter\"} + node_memory_Cached{job=\"node-exporter\"}
    + node_memory_Buffers{job=\"node-exporter\"})\n      * on (namespace, pod) group_left(node)\n
    \ node_namespace_pod:kube_pod_info:\n      )\n      /\n      sum by (node)
    (\n        node_memory_MemTotal{job=\"node-exporter\"}\n      * on (namespace,
    pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n      )\n    \"record\":
    \"node:node_memory_utilisation:\"\n  - \"expr\": |\n      1 - (node:node_memory_bytes_available:sum
    / node:node_memory_bytes_total:sum)\n    \"record\": \"node:node_memory_utilisation_2:\"\n
    \ - \"expr\": |\n      1e3 * sum by (node) (\n        (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n
    \ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n      * on (namespace,
    pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n      )\n    \"record\":
    \"node:node_memory_swap_io_bytes:sum_rate\"\n  - \"expr\": |\n      avg(irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m])
    / 1e3)\n    \"record\": \":node_disk_utilisation:avg_irate\"\n  - \"expr\": |\n
    \ avg by (node) (\n        irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m])
    / 1e3\n      * on (namespace, pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n
    \ )\n    \"record\": \"node:node_disk_utilisation:avg_irate\"\n  - \"expr\":
    |\n      avg(irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m])
    / 1e3)\n    \"record\": \":node_disk_saturation:avg_irate\"\n  - \"expr\": |\n
    \ avg by (node) (\n        irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m])
    / 1e3\n      * on (namespace, pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n
    \ )\n    \"record\": \"node:node_disk_saturation:avg_irate\"\n  - \"expr\":
    |\n      sum(irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))
    +\n      sum(irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
    \ \"record\": \":node_net_utilisation:sum_irate\"\n  - \"expr\": |\n      sum
    by (node) (\n        (irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m])
    +\n        irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
    \ * on (namespace, pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n
    \ )\n    \"record\": \"node:node_net_utilisation:sum_irate\"\n  - \"expr\":
    |\n      sum(irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))
    +\n      sum(irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
    \ \"record\": \":node_net_saturation:sum_irate\"\n  - \"expr\": |\n      sum
    by (node) (\n        (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m])
    +\n        irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
    \ * on (namespace, pod) group_left(node)\n        node_namespace_pod:kube_pod_info:\n
    \ )\n    \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-apps\"\n
    \ \"rules\": \n  - \"alert\": \"KubePodCrashLooping\"\n    \"annotations\": \n
    \ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
    }}) is restarting {{ printf \\\"%.2f\\\" $value }} / second\"\n    \"expr\": |\n
    \ rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[15m])
    > 0\n    \"for\": \"1h\"\n    \"labels\": \n      \"severity\": \"critical\"\n
    \ - \"alert\": \"KubePodNotReady\"\n    \"annotations\": \n      \"message\":
    \"{{ $labels.namespace }}/{{ $labels.pod }} is not ready.\"\n    \"expr\": |\n
    \ sum by (namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\",
    phase!~\"Running|Succeeded\"}) > 0\n    \"for\": \"1h\"\n    \"labels\": \n      \"severity\":
    \"critical\"\n  - \"alert\": \"KubeDeploymentGenerationMismatch\"\n    \"annotations\":
    \n      \"message\": \"Deployment {{ $labels.namespace }}/{{ $labels.deployment
    }} generation mismatch\"\n    \"expr\": |\n      kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n
    \ !=\n      kube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n
    \ \"for\": \"15m\"\n    \"labels\": \n      \"severity\": \"critical\"\n  -
    \"alert\": \"KubeDeploymentReplicasMismatch\"\n    \"annotations\": \n      \"message\":
    \"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch\"\n
    \ \"expr\": |\n      kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n
    \ !=\n      kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n
    \ \"for\": \"15m\"\n    \"labels\": \n      \"severity\": \"critical\"\n- \"name\":
    \"kubernetes-resources\"\n  \"rules\": \n  - \"alert\": \"KubeCPUOvercommit\"\n
    \ \"annotations\": \n      \"message\": \"Overcommitted CPU resource requests
    on Pods, cannot tolerate node failure.\"\n    \"expr\": |\n      sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)\n
    \ /\n      sum(node:node_num_cpu:sum)\n      >\n      (count(node:node_num_cpu:sum)-1)
    / count(node:node_num_cpu:sum)\n    \"for\": \"5m\"\n    \"labels\": \n      \"severity\":
    \"warning\"\n  - \"alert\": \"KubeMemOvercommit\"\n    \"annotations\": \n      \"message\":
    \"Overcommitted Memory resource requests on Pods, cannot tolerate node failure.\"\n
    \ \"expr\": |\n      sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)\n
    \ /\n      sum(node_memory_MemTotal)\n      >\n      (count(node:node_num_cpu:sum)-1)\n
    \ /\n      count(node:node_num_cpu:sum)\n    \"for\": \"5m\"\n    \"labels\":
    \n      \"severity\": \"warning\"\n  - \"alert\": \"KubeCPUOvercommit\"\n    \"annotations\":
    \n      \"message\": \"Overcommitted CPU resource request quota on Namespaces.\"\n
    \ \"expr\": |\n      sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\",
    resource=\"requests.cpu\"})\n      /\n      sum(node:node_num_cpu:sum)\n      >
    1.5\n    \"for\": \"5m\"\n    \"labels\": \n      \"severity\": \"warning\"\n
    \ - \"alert\": \"KubeMemOvercommit\"\n    \"annotations\": \n      \"message\":
    \"Overcommitted Memory resource request quota on Namespaces.\"\n    \"expr\": |\n
    \ sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"requests.memory\"})\n
    \ /\n      sum(node_memory_MemTotal{job=\"node-exporter\"})\n      > 1.5\n
    \ \"for\": \"5m\"\n    \"labels\": \n      \"severity\": \"warning\"\n  - \"alert\":
    \"KubeQuotaExceeded\"\n    \"annotations\": \n      \"message\": \"{{ printf \\\"%0.0f\\\"
    $value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}.\"\n
    \ \"expr\": |\n      100 * kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n
    \ / ignoring(instance, job, type)\n      kube_resourcequota{job=\"kube-state-metrics\",
    type=\"hard\"}\n      > 90\n    \"for\": \"15m\"\n    \"labels\": \n      \"severity\":
    \"warning\"\n- \"name\": \"kubernetes-storage\"\n  \"rules\": \n  - \"alert\":
    \"KubePersistentVolumeUsageCritical\"\n    \"annotations\": \n      \"message\":
    \"The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace
    {{ $labels.namespace }} has {{ printf \\\"%0.0f\\\" $value }}% free.\"\n    \"expr\":
    |\n      100 * kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n      /\n
    \ kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n      < 3\n    \"for\":
    \"1m\"\n    \"labels\": \n      \"severity\": \"critical\"\n  - \"alert\": \"KubePersistentVolumeFullInFourDays\"\n
    \ \"annotations\": \n      \"message\": \"Based on recent sampling, the persistent
    volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace
    }} is expected to fill up within four days.\"\n    \"expr\": |\n      predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[1h],
    4 * 24 * 3600) < 0\n    \"for\": \"5m\"\n    \"labels\": \n      \"severity\":
    \"critical\"\n- \"name\": \"kubernetes-system\"\n  \"rules\": \n  - \"alert\":
    \"KubeNodeNotReady\"\n    \"annotations\": \n      \"message\": \"{{ $labels.node
    }} has been unready for more than an hour\"\n    \"expr\": |\n      max(kube_node_status_ready{job=\"kube-state-metrics\",
    condition=\"false\"} == 1) BY (node)\n    \"for\": \"1h\"\n    \"labels\": \n
    \ \"severity\": \"warning\"\n  - \"alert\": \"KubeVersionMismatch\"\n    \"annotations\":
    \n      \"message\": \"There are {{ $value }} different versions of Kubernetes
    components running.\"\n    \"expr\": |\n      count(count(kubernetes_build_info{job!=\"kube-dns\"})
    by (gitVersion)) > 1\n    \"for\": \"1h\"\n    \"labels\": \n      \"severity\":
    \"warning\"\n  - \"alert\": \"KubeClientErrors\"\n    \"annotations\": \n      \"message\":
    \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing
    {{ printf \\\"%0.0f\\\" $value }}% errors.'\"\n    \"expr\": |\n      sum(rate(rest_client_requests_total{code!~\"2..\"}[5m]))
    by (instance, job) * 100\n      /\n      sum(rate(rest_client_requests_total[5m]))
    by (instance, job)\n      > 1\n    \"for\": \"15m\"\n    \"labels\": \n      \"severity\":
    \"warning\"\n  - \"alert\": \"KubeClientErrors\"\n    \"annotations\": \n      \"message\":
    \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing
    {{ printf \\\"%0.0f\\\" $value }} errors / sec.'\"\n    \"expr\": |\n      sum(rate(ksm_scrape_error_total{job=\"kube-state-metrics\"}[5m]))
    by (instance, job) > 0.1\n    \"for\": \"15m\"\n    \"labels\": \n      \"severity\":
    \"warning\""
kind: ConfigMap
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
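For readability, the double-quoted all.rules.yaml scalar above unescapes to plain rules YAML; the first two recording rules, for example, read:

    "groups":
    - "name": "k8s.rules"
      "rules":
      - "expr": |
          sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace)
        "record": "namespace:container_cpu_usage_seconds_total:sum_rate"
      - "expr": |
          sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace)
        "record": "namespace:container_memory_usage_bytes:sum"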