Merge pull request #425 from alok87/weave-net

Weave Net Monitoring setup using kube-prometheus
This commit is contained in:
Frederic Branczyk
2020-03-04 20:20:16 +01:00
committed by GitHub
6 changed files with 6253 additions and 5 deletions

View File

@@ -656,9 +656,10 @@ Should the Prometheus `/targets` page show kubelet targets, but not able to succ
As described in the [Prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default.
If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
- If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md).
- If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md).
- If you are using Weave Net, see [Weave Net support](docs/weave-net-support.md).
If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md)
#### Authentication problem
The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations.

67
docs/weave-net-support.md Normal file
View File

@@ -0,0 +1,67 @@
# Setup Weave Net monitoring using kube-prometheus
[Weave Net](https://kubernetes.io/docs/concepts/cluster-administration/networking/#weave-net-from-weaveworks) is a resilient and simple to use CNI provider for Kubernetes. A well monitored and observed CNI provider helps in troubleshooting Kubernetes networking problems. [Weave Net](https://www.weave.works/docs/net/latest/concepts/how-it-works/) emits [prometheus metrics](https://www.weave.works/docs/net/latest/tasks/manage/metrics/) for monitoring Weave Net. There are many ways to install Weave Net in your cluster. One of them is using [kops](https://github.com/kubernetes/kops/blob/master/docs/networking.md).
Following this document, you can setup Weave Net monitoring for your cluster using kube-prometheus.
## Contents
Using kube-prometheus and kubectl you will be able install the following for monitoring Weave Net in your cluster:
1. [Service for Weave Net](https://gist.github.com/alok87/379c6234b582f555c141f6fddea9fbce) The service which the [service monitor](https://coreos.com/operators/prometheus/docs/latest/user-guides/cluster-monitoring.html) scrapes.
2. [ServiceMonitor for Weave Net](https://gist.github.com/alok87/e46a7f9a79ef6d1da6964a035be2cfb9) Service monitor to scrape the Weave Net metrics and bring it to Prometheus.
3. [Prometheus Alerts for Weave Net](https://stackoverflow.com/a/60447864) This will setup all the important Weave Net metrics you should be alerted on.
4. [Grafana Dashboard for Weave Net](https://grafana.com/grafana/dashboards/11789) This will setup the per Weave Net pod level monitoring for Weave Net.
5. [Grafana Dashboard for Weave Net(Cluster)](https://grafana.com/grafana/dashboards/11789) This will setup the cluster level monitoring for Weave Net.
## Instructions
- You can monitor Weave Net using an example like below. **Please note that some alert configurations are environment specific and may require modifications of alert thresholds**. For example: The FastDP flows have never gone below 15000 for us. But if this value is say 20000 for you then you can use an example like below to update the alert. The alerts which may require threshold modifications are `WeaveNetFastDPFlowsLow` and `WeaveNetIPAMUnreachable`.
[embedmd]:# (../examples/weave-net-example.jsonnet)
```jsonnet
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-weave-net.libsonnet') + {
_config+:: {
namespace: 'monitoring',
},
prometheusAlerts+:: {
groups: std.map(
function(group)
if group.name == 'weave-net' then
group {
rules: std.map(function(rule)
if rule.alert == "WeaveNetFastDPFlowsLow" then
rule {
expr: "sum(weave_flows) < 20000"
}
else if rule.alert == "WeaveNetIPAMUnreachable" then
rule {
expr: "weave_ipam_unreachable_percentage > 25"
}
else
rule
,
group.rules
)
}
else
group,
super.groups
),
},
};
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
```
- After you have the required yamls file please run
```
kubectl create -f prometheus-serviceWeaveNet.yaml
kubectl create -f prometheus-serviceMonitorWeaveNet.yaml
kubectl apply -f prometheus-rules.yaml
kubectl apply -f grafana-dashboardDefinitions.yaml
```

View File

@@ -0,0 +1,39 @@
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-weave-net.libsonnet') + {
_config+:: {
namespace: 'monitoring',
},
prometheusAlerts+:: {
groups: std.map(
function(group)
if group.name == 'weave-net' then
group {
rules: std.map(function(rule)
if rule.alert == "WeaveNetFastDPFlowsLow" then
rule {
expr: "sum(weave_flows) < 20000"
}
else if rule.alert == "WeaveNetIPAMUnreachable" then
rule {
expr: "weave_ipam_unreachable_percentage > 25"
}
else
rule
,
group.rules
)
}
else
group,
super.groups
),
},
};
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,189 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
serviceWeaveNet:
service.new('weave-net', { 'k8s-app': 'weave-net' }, servicePort.newNamed('weave-net-metrics', 6782, 6782)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'weave-net' }) +
service.mixin.spec.withClusterIp('None'),
serviceMonitorWeaveNet: {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'weave-net',
labels: {
'k8s-app': 'weave-net',
},
namespace: 'monitoring',
},
spec: {
jobLabel: 'k8s-app',
endpoints: [
{
port: 'weave-metrics',
path: '/metrics',
interval: '15s',
},
],
namespaceSelector: {
matchNames: [
'kube-system',
],
},
selector: {
matchLabels: {
'k8s-app': 'weave-net',
},
},
},
},
},
prometheusRules+: {
groups+: [
{
name: 'weave-net',
rules: [
{
alert: 'WeaveNetIPAMSplitBrain',
expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.',
description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.',
},
},
{
alert: 'WeaveNetIPAMUnreachable',
expr: 'weave_ipam_unreachable_percentage > 25',
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.',
description: 'actionable: Please find the problem and fix it.',
},
},
{
alert: 'WeaveNetIPAMPendingAllocates',
expr: 'sum(weave_ipam_pending_allocates) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Number of pending allocates is above the threshold.',
description: 'actionable: Please find the problem and fix it.',
},
},
{
alert: 'WeaveNetIPAMPendingClaims',
expr: 'sum(weave_ipam_pending_claims) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Number of pending claims is above the threshold.',
description: 'actionable: Please find the problem and fix it.',
},
},
{
alert: 'WeaveNetFastDPFlowsLow',
expr: 'sum(weave_flows) < 15000',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Number of FastDP flows is below the threshold.',
description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.',
},
},
{
alert: 'WeaveNetFastDPFlowsOff',
expr: 'sum(weave_flows == bool 0) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'FastDP flows is zero.',
description: 'actionable: Please find the reason for FastDP flows to be off and fix it.',
},
},
{
alert: 'WeaveNetHighConnectionTerminationRate',
expr: 'rate(weave_connection_terminations_total[5m]) > 0.1',
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are getting terminated.',
description: 'actionable: Please find the reason for the high connection termination rate and fix it.',
},
},
{
alert: 'WeaveNetConnectionsConnecting',
expr: 'sum(weave_connections{state="connecting"}) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are in connecting state.',
description: 'actionable: Please find the reason for this and fix it.',
},
},
{
alert: 'WeaveNetConnectionsRetying',
expr: 'sum(weave_connections{state="retrying"}) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are in retrying state.',
description: 'actionable: Please find the reason for this and fix it.',
},
},
{
alert: 'WeaveNetConnectionsPending',
expr: 'sum(weave_connections{state="pending"}) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are in pending state.',
description: 'actionable: Please find the reason for this and fix it.',
},
},
{
alert: 'WeaveNetConnectionsFailed',
expr: 'sum(weave_connections{state="failed"}) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are in failed state.',
description: 'actionable: Please find the reason and fix it.',
},
},
],
},
],
},
grafanaDashboards+:: {
'weave-net.json': (import 'grafana-weave-net.json'),
'weave-net-cluster.json': (import 'grafana-weave-net-cluster.json'),
},
}