examples,jsonnet: fix weave-net
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
local kp = (import 'kube-prometheus/main.libsonnet') +
|
||||
(import 'kube-prometheus/addons/weave-net.libsonnet') + {
|
||||
(import 'kube-prometheus/addons/weave-net/weave-net.libsonnet') + {
|
||||
values+:: {
|
||||
common+: {
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
},
|
||||
prometheus+: {
|
||||
prometheusRule+: {
|
||||
prometheusRuleWeaveNet+: {
|
||||
spec+: {
|
||||
groups: std.map(
|
||||
function(group)
|
||||
|
||||
@@ -1,196 +0,0 @@
|
||||
{
|
||||
prometheus+: {
|
||||
serviceWeaveNet: {
|
||||
apiVersion: 'v1',
|
||||
kind: 'Service',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
namespace: 'kube-system',
|
||||
labels: { 'app.kubernetes.io/name': 'weave-net' },
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
{ name: 'weave-net-metrics', targetPort: 6782, port: 6782 },
|
||||
],
|
||||
selector: { name: 'weave-net' },
|
||||
clusterIP: 'None',
|
||||
},
|
||||
},
|
||||
serviceMonitorWeaveNet: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
labels: {
|
||||
'app.kubernetes.io/name': 'weave-net',
|
||||
},
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
spec: {
|
||||
jobLabel: 'app.kubernetes.io/name',
|
||||
endpoints: [
|
||||
{
|
||||
port: 'weave-net-metrics',
|
||||
path: '/metrics',
|
||||
interval: '15s',
|
||||
},
|
||||
],
|
||||
namespaceSelector: {
|
||||
matchNames: [
|
||||
'kube-system',
|
||||
],
|
||||
},
|
||||
selector: {
|
||||
matchLabels: {
|
||||
'app.kubernetes.io/name': 'weave-net',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusRules+: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'weave-net',
|
||||
rules: [
|
||||
{
|
||||
alert: 'WeaveNetIPAMSplitBrain',
|
||||
expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.',
|
||||
description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMUnreachable',
|
||||
expr: 'weave_ipam_unreachable_percentage > 25',
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingAllocates',
|
||||
expr: 'sum(weave_ipam_pending_allocates) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending allocates is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingClaims',
|
||||
expr: 'sum(weave_ipam_pending_claims) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending claims is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsLow',
|
||||
expr: 'sum(weave_flows) < 15000',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of FastDP flows is below the threshold.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsOff',
|
||||
expr: 'sum(weave_flows == bool 0) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'FastDP flows is zero.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to be off and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetHighConnectionTerminationRate',
|
||||
expr: 'rate(weave_connection_terminations_total[5m]) > 0.1',
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are getting terminated.',
|
||||
description: 'actionable: Please find the reason for the high connection termination rate and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsConnecting',
|
||||
expr: 'sum(weave_connections{state="connecting"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in connecting state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsRetying',
|
||||
expr: 'sum(weave_connections{state="retrying"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in retrying state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsPending',
|
||||
expr: 'sum(weave_connections{state="pending"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in pending state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsFailed',
|
||||
expr: 'sum(weave_connections{state="failed"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in failed state.',
|
||||
description: 'actionable: Please find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
grafanaDashboards+:: {
|
||||
'weave-net.json': (import './grafana-weave-net.json'),
|
||||
'weave-net-cluster.json': (import './grafana-weave-net-cluster.json'),
|
||||
},
|
||||
}
|
||||
134
jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet
Normal file
134
jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet
Normal file
@@ -0,0 +1,134 @@
|
||||
[
|
||||
{
|
||||
alert: 'WeaveNetIPAMSplitBrain',
|
||||
expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.',
|
||||
description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMUnreachable',
|
||||
expr: 'weave_ipam_unreachable_percentage > 25',
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingAllocates',
|
||||
expr: 'sum(weave_ipam_pending_allocates) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending allocates is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingClaims',
|
||||
expr: 'sum(weave_ipam_pending_claims) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending claims is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsLow',
|
||||
expr: 'sum(weave_flows) < 15000',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of FastDP flows is below the threshold.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsOff',
|
||||
expr: 'sum(weave_flows == bool 0) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'FastDP flows is zero.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to be off and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetHighConnectionTerminationRate',
|
||||
expr: 'rate(weave_connection_terminations_total[5m]) > 0.1',
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are getting terminated.',
|
||||
description: 'actionable: Please find the reason for the high connection termination rate and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsConnecting',
|
||||
expr: 'sum(weave_connections{state="connecting"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in connecting state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsRetying',
|
||||
expr: 'sum(weave_connections{state="retrying"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in retrying state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsPending',
|
||||
expr: 'sum(weave_connections{state="pending"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in pending state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsFailed',
|
||||
expr: 'sum(weave_connections{state="failed"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in failed state.',
|
||||
description: 'actionable: Please find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
]
|
||||
73
jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet
Normal file
73
jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet
Normal file
@@ -0,0 +1,73 @@
|
||||
{
|
||||
prometheus+: {
|
||||
local p = self,
|
||||
serviceWeaveNet: {
|
||||
apiVersion: 'v1',
|
||||
kind: 'Service',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
namespace: 'kube-system',
|
||||
labels: { 'app.kubernetes.io/name': 'weave-net' },
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
{ name: 'weave-net-metrics', targetPort: 6782, port: 6782 },
|
||||
],
|
||||
selector: { name: 'weave-net' },
|
||||
clusterIP: 'None',
|
||||
},
|
||||
},
|
||||
serviceMonitorWeaveNet: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
labels: {
|
||||
'app.kubernetes.io/name': 'weave-net',
|
||||
},
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
spec: {
|
||||
jobLabel: 'app.kubernetes.io/name',
|
||||
endpoints: [
|
||||
{
|
||||
port: 'weave-net-metrics',
|
||||
path: '/metrics',
|
||||
interval: '15s',
|
||||
},
|
||||
],
|
||||
namespaceSelector: {
|
||||
matchNames: [
|
||||
'kube-system',
|
||||
],
|
||||
},
|
||||
selector: {
|
||||
matchLabels: {
|
||||
'app.kubernetes.io/name': 'weave-net',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusRuleWeaveNet: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'PrometheusRule',
|
||||
metadata: {
|
||||
labels: p.config.mixin.ruleLabels,
|
||||
name: 'weave-net-rules',
|
||||
namespace: p.config.namespace,
|
||||
},
|
||||
spec: {
|
||||
groups: [{
|
||||
name: 'weave-net',
|
||||
rules: (import './alerts.libsonnet'),
|
||||
}],
|
||||
},
|
||||
},
|
||||
mixin+:: {
|
||||
grafanaDashboards+:: {
|
||||
'weave-net.json': (import './grafana-weave-net.json'),
|
||||
'weave-net-cluster.json': (import './grafana-weave-net-cluster.json'),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
Reference in New Issue
Block a user