Added weavenet monitoring setup using kube-prometheus
This commit is contained in:
@@ -656,9 +656,10 @@ Should the Prometheus `/targets` page show kubelet targets, but not able to succ
|
||||
|
||||
As described in the [Prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default.
|
||||
|
||||
If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
|
||||
- If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
|
||||
- If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md).
|
||||
- If you are using WeaveNet as the CNI, see [weave-net support](docs/weave-net-support.md).
|
||||
|
||||
If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md)
|
||||
#### Authentication problem
|
||||
|
||||
The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations.
|
||||
|
||||
36
docs/weave-net-support.md
Normal file
36
docs/weave-net-support.md
Normal file
@@ -0,0 +1,36 @@
|
||||
# Setup WeaveNet CNI monitoring using kube-prometheus
|
||||
[WeaveNet](https://kubernetes.io/docs/concepts/cluster-administration/networking/#weave-net-from-weaveworks) is a resilient and simple to use CNI for Kubernetes. A well monitored and observed CNI helps in troubleshooting Kubernetes networking problems. [WeaveNet](https://www.weave.works/docs/net/latest/concepts/how-it-works/) emits [prometheus metrics](https://www.weave.works/docs/net/latest/tasks/manage/metrics/) for monitoring WeaveNet. There are many ways to install WeaveNet in your cluster. One of them is using [kops](https://github.com/kubernetes/kops/blob/master/docs/networking.md).
|
||||
|
||||
Following this document, you can setup weave net CNI monitoring for your cluster using kube-prometheus.
|
||||
|
||||
## Contents
|
||||
Using kube-prometheus and kubectl you will be able install the following for monitoring weave-net in your cluster:
|
||||
|
||||
1. [Service for WeaveNet](https://gist.github.com/alok87/379c6234b582f555c141f6fddea9fbce) The service which the [service monitor](https://coreos.com/operators/prometheus/docs/latest/user-guides/cluster-monitoring.html) scraps.
|
||||
2. [ServiceMonitor for WeaveNet](https://gist.github.com/alok87/e46a7f9a79ef6d1da6964a035be2cfb9) Service monitor to scraps the weavenet metrics and bring it to Prometheus.
|
||||
3. [Prometheus Alerts for WeaveNet](https://stackoverflow.com/a/60447864) This will setup all the important weave net metrics you should be alerted on.
|
||||
4. [Grafana Dashboard for WeaveNet](https://grafana.com/grafana/dashboards/11789) This will setup the per CNI pod level monitoring for weave net.
|
||||
5. [Grafana Dashboard for WeaveNet(Cluster)](https://grafana.com/grafana/dashboards/11789) This will setup the cluster level monitoring for weave net.
|
||||
|
||||
## Instructions
|
||||
- You can monitor weave-net CNI using kube-prometheus with:
|
||||
[embedmd]:# (../examples/weavenet-example.jsonnet)
|
||||
```jsonnet
|
||||
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
|
||||
(import 'kube-prometheus/kube-prometheus-weavenet.libsonnet');
|
||||
|
||||
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
|
||||
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
|
||||
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
|
||||
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
|
||||
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
|
||||
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) }
|
||||
```
|
||||
|
||||
- After you have the required yamls file please run
|
||||
```
|
||||
kubectl create -f prometheus-serviceWeaveNet.yaml
|
||||
kubectl create -f prometheus-serviceMonitorWeaveNet.yaml
|
||||
kubectl apply -f prometheus-rules.yaml
|
||||
kubectl apply -f grafana-dashboardDefinitions.yaml
|
||||
```
|
||||
9
examples/weavenet-example.jsonnet
Normal file
9
examples/weavenet-example.jsonnet
Normal file
@@ -0,0 +1,9 @@
|
||||
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
|
||||
(import 'kube-prometheus/kube-prometheus-weavenet.libsonnet');
|
||||
|
||||
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
|
||||
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
|
||||
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
|
||||
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
|
||||
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
|
||||
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) }
|
||||
3347
jsonnet/kube-prometheus/grafana-weavenet-cluster.json
Normal file
3347
jsonnet/kube-prometheus/grafana-weavenet-cluster.json
Normal file
File diff suppressed because it is too large
Load Diff
2605
jsonnet/kube-prometheus/grafana-weavenet.json
Normal file
2605
jsonnet/kube-prometheus/grafana-weavenet.json
Normal file
File diff suppressed because it is too large
Load Diff
189
jsonnet/kube-prometheus/kube-prometheus-weavenet.libsonnet
Normal file
189
jsonnet/kube-prometheus/kube-prometheus-weavenet.libsonnet
Normal file
@@ -0,0 +1,189 @@
|
||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
||||
local service = k.core.v1.service;
|
||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
||||
|
||||
{
|
||||
prometheus+: {
|
||||
serviceWeaveNet:
|
||||
service.new('weave-net', { 'k8s-app': 'weave-net' }, servicePort.newNamed('weave-net-metrics', 6782, 6782)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'weave-net' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
serviceMonitorWeaveNet: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
labels: {
|
||||
'k8s-app': 'weave-net',
|
||||
},
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
spec: {
|
||||
jobLabel: 'k8s-app',
|
||||
endpoints: [
|
||||
{
|
||||
port: 'weave-metrics',
|
||||
path: '/metrics',
|
||||
interval: '15s',
|
||||
},
|
||||
],
|
||||
namespaceSelector: {
|
||||
matchNames: [
|
||||
'kube-system',
|
||||
],
|
||||
},
|
||||
selector: {
|
||||
matchLabels: {
|
||||
'k8s-app': 'weave-net',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusRules+: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'weave-net',
|
||||
rules: [
|
||||
{
|
||||
alert: 'WeaveNetIPAMSplitBrain',
|
||||
expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'WeaveNetIPAM has a split brain. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Every node should see same unreachability percentage. Please check and fix why it is not so.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMUnreachable',
|
||||
expr: 'weave_ipam_unreachable_percentage > 25',
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'WeaveNetIPAM unreachability percentage is above threshold. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find why the unreachability threshold have increased from threshold and fix it. WeaveNet is responsible to keep it under control. Weave rm peer deployment can help clean things.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingAllocates',
|
||||
expr: 'sum(weave_ipam_pending_allocates) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'WeaveNet IPAM has pending allocates. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason for IPAM allocates to be in pending state and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingClaims',
|
||||
expr: 'sum(weave_ipam_pending_claims) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'WeaveNet IPAM has pending claims. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason for IPAM claims to be in pending state and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsLow',
|
||||
expr: 'sum(weave_flows) < 15000',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'WeaveNet total FastDP flows is below threshold. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason for fast dp flows dropping below the threshold.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsOff',
|
||||
expr: 'sum(weave_flows == bool 0) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'WeaveNet FastDP flows is not happening in some or all nodes. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason for fast dp being off.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetHighConnectionTerminationRate',
|
||||
expr: 'rate(weave_connection_terminations_total[5m]) > 0.1',
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are getting terminated. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason for high connection termination rate and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsConnecting',
|
||||
expr: 'sum(weave_connections{state="connecting"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in connecting state. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsRetying',
|
||||
expr: 'sum(weave_connections{state="retrying"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in retrying state. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsPending',
|
||||
expr: 'sum(weave_connections{state="pending"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in pending state. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsFailed',
|
||||
expr: 'sum(weave_connections{state="failed"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in failed state. Go to the below prometheus link for details.',
|
||||
description: 'Actionable: Find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
grafanaDashboards+:: {
|
||||
'weavenet.json': (import 'grafana-weavenet.json'),
|
||||
'weavenet-cluster.json': (import 'grafana-weavenet-cluster.json'),
|
||||
},
|
||||
}
|
||||
Reference in New Issue
Block a user