Merge pull request #425 from alok87/weave-net
Weave Net Monitoring setup using kube-prometheus
This commit is contained in:
3347
jsonnet/kube-prometheus/grafana-weave-net-cluster.json
Normal file
3347
jsonnet/kube-prometheus/grafana-weave-net-cluster.json
Normal file
File diff suppressed because it is too large
Load Diff
2605
jsonnet/kube-prometheus/grafana-weave-net.json
Normal file
2605
jsonnet/kube-prometheus/grafana-weave-net.json
Normal file
File diff suppressed because it is too large
Load Diff
189
jsonnet/kube-prometheus/kube-prometheus-weave-net.libsonnet
Normal file
189
jsonnet/kube-prometheus/kube-prometheus-weave-net.libsonnet
Normal file
@@ -0,0 +1,189 @@
|
||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
||||
local service = k.core.v1.service;
|
||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
||||
|
||||
{
|
||||
prometheus+: {
|
||||
serviceWeaveNet:
|
||||
service.new('weave-net', { 'k8s-app': 'weave-net' }, servicePort.newNamed('weave-net-metrics', 6782, 6782)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'weave-net' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
serviceMonitorWeaveNet: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
labels: {
|
||||
'k8s-app': 'weave-net',
|
||||
},
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
spec: {
|
||||
jobLabel: 'k8s-app',
|
||||
endpoints: [
|
||||
{
|
||||
port: 'weave-metrics',
|
||||
path: '/metrics',
|
||||
interval: '15s',
|
||||
},
|
||||
],
|
||||
namespaceSelector: {
|
||||
matchNames: [
|
||||
'kube-system',
|
||||
],
|
||||
},
|
||||
selector: {
|
||||
matchLabels: {
|
||||
'k8s-app': 'weave-net',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusRules+: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'weave-net',
|
||||
rules: [
|
||||
{
|
||||
alert: 'WeaveNetIPAMSplitBrain',
|
||||
expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.',
|
||||
description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMUnreachable',
|
||||
expr: 'weave_ipam_unreachable_percentage > 25',
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingAllocates',
|
||||
expr: 'sum(weave_ipam_pending_allocates) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending allocates is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingClaims',
|
||||
expr: 'sum(weave_ipam_pending_claims) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending claims is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsLow',
|
||||
expr: 'sum(weave_flows) < 15000',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of FastDP flows is below the threshold.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsOff',
|
||||
expr: 'sum(weave_flows == bool 0) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'FastDP flows is zero.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to be off and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetHighConnectionTerminationRate',
|
||||
expr: 'rate(weave_connection_terminations_total[5m]) > 0.1',
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are getting terminated.',
|
||||
description: 'actionable: Please find the reason for the high connection termination rate and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsConnecting',
|
||||
expr: 'sum(weave_connections{state="connecting"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in connecting state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsRetying',
|
||||
expr: 'sum(weave_connections{state="retrying"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in retrying state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsPending',
|
||||
expr: 'sum(weave_connections{state="pending"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in pending state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsFailed',
|
||||
expr: 'sum(weave_connections{state="failed"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in failed state.',
|
||||
description: 'actionable: Please find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
grafanaDashboards+:: {
|
||||
'weave-net.json': (import 'grafana-weave-net.json'),
|
||||
'weave-net-cluster.json': (import 'grafana-weave-net-cluster.json'),
|
||||
},
|
||||
}
|
||||
Reference in New Issue
Block a user