Merge pull request #875 from paulfantom/directory-layout
Simplify directory structure
This commit is contained in:
@@ -25,14 +25,14 @@
|
||||
alertmanager+:: {
|
||||
alertmanager+: {
|
||||
spec+:
|
||||
antiaffinity('alertmanager', [$._config.alertmanager.name], $._config.namespace),
|
||||
antiaffinity('alertmanager', [$.values.alertmanager.name], $.values.common.namespace),
|
||||
},
|
||||
},
|
||||
|
||||
prometheus+:: {
|
||||
prometheus+: {
|
||||
spec+:
|
||||
antiaffinity('prometheus', [$._config.prometheus.name], $._config.namespace),
|
||||
antiaffinity('prometheus', [$.values.prometheus.name], $.values.common.namespace),
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -1,4 +1,18 @@
|
||||
local l = import 'lib/lib.libsonnet';
|
||||
local imageName(image) =
|
||||
local parts = std.split(image, '/');
|
||||
local len = std.length(parts);
|
||||
if len == 3 then
|
||||
// registry.com/org/image
|
||||
parts[2]
|
||||
else if len == 2 then
|
||||
// org/image
|
||||
parts[1]
|
||||
else if len == 1 then
|
||||
// image only, e.g. busybox
|
||||
parts[0]
|
||||
else
|
||||
error 'unknown image format: ' + image;
|
||||
|
||||
|
||||
// withImageRepository is a mixin that replaces the prefix of every image with the given repository, e.g.
|
||||
// quay.io/coreos/addon-resizer -> $repository/addon-resizer
|
||||
@@ -6,8 +20,8 @@ local l = import 'lib/lib.libsonnet';
|
||||
local withImageRepository(repository) = {
|
||||
local oldRepos = super._config.imageRepos,
|
||||
local substituteRepository(image, repository) =
|
||||
if repository == null then image else repository + '/' + l.imageName(image),
|
||||
_config+:: {
|
||||
if repository == null then image else repository + '/' + imageName(image),
|
||||
values+:: {
|
||||
imageRepos:: {
|
||||
[field]: substituteRepository(oldRepos[field], repository)
|
||||
for field in std.objectFields(oldRepos)
|
||||
@@ -2,9 +2,9 @@
|
||||
// For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links
|
||||
|
||||
{
|
||||
_config+:: {
|
||||
prometheusAdapter+:: {
|
||||
namespace: $._config.namespace,
|
||||
values+:: {
|
||||
prometheusAdapter+: {
|
||||
namespace: $.values.common.namespace,
|
||||
// Rules for custom-metrics
|
||||
config+:: {
|
||||
rules+: [
|
||||
@@ -78,7 +78,7 @@
|
||||
},
|
||||
},
|
||||
|
||||
prometheusAdapter+:: {
|
||||
prometheusAdapter+: {
|
||||
customMetricsApiService: {
|
||||
apiVersion: 'apiregistration.k8s.io/v1',
|
||||
kind: 'APIService',
|
||||
@@ -88,7 +88,7 @@
|
||||
spec: {
|
||||
service: {
|
||||
name: $.prometheusAdapter.service.metadata.name,
|
||||
namespace: $._config.prometheusAdapter.namespace,
|
||||
namespace: $.values.prometheusAdapter.namespace,
|
||||
},
|
||||
group: 'custom.metrics.k8s.io',
|
||||
version: 'v1beta1',
|
||||
@@ -106,7 +106,7 @@
|
||||
spec: {
|
||||
service: {
|
||||
name: $.prometheusAdapter.service.metadata.name,
|
||||
namespace: $._config.prometheusAdapter.namespace,
|
||||
namespace: $.values.prometheusAdapter.namespace,
|
||||
},
|
||||
group: 'custom.metrics.k8s.io',
|
||||
version: 'v1beta2',
|
||||
@@ -141,7 +141,7 @@
|
||||
subjects: [{
|
||||
kind: 'ServiceAccount',
|
||||
name: $.prometheusAdapter.serviceAccount.metadata.name,
|
||||
namespace: $._config.prometheusAdapter.namespace,
|
||||
namespace: $.values.prometheusAdapter.namespace,
|
||||
}],
|
||||
},
|
||||
customMetricsClusterRoleBindingHPA: {
|
||||
@@ -2,9 +2,9 @@
|
||||
// For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links
|
||||
|
||||
{
|
||||
_config+:: {
|
||||
prometheusAdapter+:: {
|
||||
namespace: $._config.namespace,
|
||||
values+:: {
|
||||
prometheusAdapter+: {
|
||||
namespace: $.values.common.namespace,
|
||||
// Rules for external-metrics
|
||||
config+:: {
|
||||
externalRules+: [
|
||||
@@ -24,7 +24,7 @@
|
||||
},
|
||||
},
|
||||
|
||||
prometheusAdapter+:: {
|
||||
prometheusAdapter+: {
|
||||
externalMetricsApiService: {
|
||||
apiVersion: 'apiregistration.k8s.io/v1',
|
||||
kind: 'APIService',
|
||||
@@ -34,7 +34,7 @@
|
||||
spec: {
|
||||
service: {
|
||||
name: $.prometheusAdapter.service.metadata.name,
|
||||
namespace: $._config.prometheusAdapter.namespace,
|
||||
namespace: $.values.prometheusAdapter.namespace,
|
||||
},
|
||||
group: 'external.metrics.k8s.io',
|
||||
version: 'v1beta1',
|
||||
@@ -70,7 +70,7 @@
|
||||
subjects: [{
|
||||
kind: 'ServiceAccount',
|
||||
name: $.prometheusAdapter.serviceAccount.metadata.name,
|
||||
namespace: $._config.prometheusAdapter.namespace,
|
||||
namespace: $.values.prometheusAdapter.namespace,
|
||||
}],
|
||||
},
|
||||
externalMetricsClusterRoleBindingHPA: {
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
prometheus+:: {
|
||||
prometheus+: {
|
||||
serviceMonitorKubelet+:
|
||||
{
|
||||
spec+: {
|
||||
@@ -1,14 +1,15 @@
|
||||
{
|
||||
_config+:: {
|
||||
versions+:: { clusterVerticalAutoscaler: '0.8.1' },
|
||||
imageRepos+:: { clusterVerticalAutoscaler: 'gcr.io/google_containers/cpvpa-amd64' },
|
||||
|
||||
kubeStateMetrics+:: {
|
||||
values+:: {
|
||||
clusterVerticalAutoscaler: {
|
||||
version: '0.8.1',
|
||||
image: 'gcr.io/google_containers/cpvpa-amd64:v0.8.1',
|
||||
baseCPU: '1m',
|
||||
stepCPU: '1m',
|
||||
baseMemory: '1Mi',
|
||||
stepMemory: '2Mi',
|
||||
},
|
||||
},
|
||||
ksmAutoscaler+:: {
|
||||
ksmAutoscaler+: {
|
||||
clusterRole: {
|
||||
apiVersion: 'rbac.authorization.k8s.io/v1',
|
||||
kind: 'ClusterRole',
|
||||
@@ -29,7 +30,7 @@
|
||||
kind: 'ClusterRole',
|
||||
name: 'ksm-autoscaler',
|
||||
},
|
||||
subjects: [{ kind: 'ServiceAccount', name: 'ksm-autoscaler', namespace: $._config.namespace }],
|
||||
subjects: [{ kind: 'ServiceAccount', name: 'ksm-autoscaler', namespace: $.values.common.namespace }],
|
||||
},
|
||||
|
||||
roleBinding: {
|
||||
@@ -37,7 +38,7 @@
|
||||
kind: 'RoleBinding',
|
||||
metadata: {
|
||||
name: 'ksm-autoscaler',
|
||||
namespace: $._config.namespace,
|
||||
namespace: $.values.common.namespace,
|
||||
},
|
||||
roleRef: {
|
||||
apiGroup: 'rbac.authorization.k8s.io',
|
||||
@@ -52,7 +53,7 @@
|
||||
kind: 'Role',
|
||||
metadata: {
|
||||
name: 'ksm-autoscaler',
|
||||
namespace: $._config.namespace,
|
||||
namespace: $.values.common.namespace,
|
||||
},
|
||||
rules: [
|
||||
{
|
||||
@@ -75,7 +76,7 @@
|
||||
kind: 'ServiceAccount',
|
||||
metadata: {
|
||||
name: 'ksm-autoscaler',
|
||||
namespace: $._config.namespace,
|
||||
namespace: $.values.common.namespace,
|
||||
},
|
||||
},
|
||||
|
||||
@@ -83,14 +84,21 @@
|
||||
local podLabels = { app: 'ksm-autoscaler' };
|
||||
local c = {
|
||||
name: 'ksm-autoscaler',
|
||||
image: $._config.imageRepos.clusterVerticalAutoscaler + ':v' + $._config.versions.clusterVerticalAutoscaler,
|
||||
image: $.values.clusterVerticalAutoscaler.image,
|
||||
args: [
|
||||
'/cpvpa',
|
||||
'--target=deployment/kube-state-metrics',
|
||||
'--namespace=' + $._config.namespace,
|
||||
'--namespace=' + $.values.common.namespace,
|
||||
'--logtostderr=true',
|
||||
'--poll-period-seconds=10',
|
||||
'--default-config={"kube-state-metrics":{"requests":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}},"limits":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}}}}',
|
||||
'--default-config={"kube-state-metrics":{"requests":{"cpu":{"base":"' + $.values.clusterVerticalAutoscaler.baseCPU +
|
||||
'","step":"' + $.values.clusterVerticalAutoscaler.stepCPU +
|
||||
'","nodesPerStep":1},"memory":{"base":"' + $.values.clusterVerticalAutoscaler.baseMemory +
|
||||
'","step":"' + $.values.clusterVerticalAutoscaler.stepMemory +
|
||||
'","nodesPerStep":1}},"limits":{"cpu":{"base":"' + $.values.clusterVerticalAutoscaler.baseCPU +
|
||||
'","step":"' + $.values.clusterVerticalAutoscaler.stepCPU +
|
||||
'","nodesPerStep":1},"memory":{"base":"' + $.values.clusterVerticalAutoscaler.baseMemory +
|
||||
'","step":"' + $.values.clusterVerticalAutoscaler.stepMemory + '","nodesPerStep":1}}}}',
|
||||
],
|
||||
resources: {
|
||||
requests: { cpu: '20m', memory: '10Mi' },
|
||||
@@ -102,7 +110,7 @@
|
||||
kind: 'Deployment',
|
||||
metadata: {
|
||||
name: 'ksm-autoscaler',
|
||||
namespace: $._config.namespace,
|
||||
namespace: $.values.common.namespace,
|
||||
labels: podLabels,
|
||||
},
|
||||
spec: {
|
||||
@@ -1,8 +1,8 @@
|
||||
// On managed Kubernetes clusters some of the control plane components are not exposed to customers.
|
||||
// Disable scrape jobs, service monitors, and alert groups for these components by overwriting 'kube-prometheus.libsonnet' defaults
|
||||
// Disable scrape jobs, service monitors, and alert groups for these components by overwriting 'main.libsonnet' defaults
|
||||
|
||||
{
|
||||
_config+:: {
|
||||
values+:: {
|
||||
// This snippet walks the original object (super.jobs, set as temp var j) and creates a replacement jobs object
|
||||
// excluding any members of the set specified (eg: controller and scheduler).
|
||||
local j = super.jobs,
|
||||
@@ -13,7 +13,7 @@
|
||||
},
|
||||
|
||||
// Skip alerting rules too
|
||||
prometheus+:: {
|
||||
prometheus+: {
|
||||
rules+:: {
|
||||
local g = super.groups,
|
||||
groups: [
|
||||
@@ -1,5 +1,5 @@
|
||||
(import 'github.com/etcd-io/etcd/Documentation/etcd-mixin/mixin.libsonnet') + {
|
||||
_config+:: {
|
||||
values+:: {
|
||||
etcd: {
|
||||
ips: [],
|
||||
clientCA: null,
|
||||
@@ -9,7 +9,7 @@
|
||||
insecureSkipVerify: null,
|
||||
},
|
||||
},
|
||||
prometheus+:: {
|
||||
prometheus+: {
|
||||
serviceEtcd: {
|
||||
apiVersion: 'v1',
|
||||
kind: 'Service',
|
||||
@@ -36,7 +36,7 @@
|
||||
subsets: [{
|
||||
addresses: [
|
||||
{ ip: etcdIP }
|
||||
for etcdIP in $._config.etcd.ips
|
||||
for etcdIP in $.values.etcd.ips
|
||||
],
|
||||
ports: [
|
||||
{ name: 'metrics', port: 2379, protocol: 'TCP' },
|
||||
@@ -65,8 +65,8 @@
|
||||
caFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt',
|
||||
keyFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key',
|
||||
certFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.crt',
|
||||
[if $._config.etcd.serverName != null then 'serverName']: $._config.etcd.serverName,
|
||||
[if $._config.etcd.insecureSkipVerify != null then 'insecureSkipVerify']: $._config.etcd.insecureSkipVerify,
|
||||
[if $.values.etcd.serverName != null then 'serverName']: $.values.etcd.serverName,
|
||||
[if $.values.etcd.insecureSkipVerify != null then 'insecureSkipVerify']: $.values.etcd.insecureSkipVerify,
|
||||
},
|
||||
},
|
||||
],
|
||||
@@ -84,12 +84,12 @@
|
||||
type: 'Opaque',
|
||||
metadata: {
|
||||
name: 'kube-etcd-client-certs',
|
||||
namespace: $._config.namespace,
|
||||
namespace: $.values.common.namespace,
|
||||
},
|
||||
data: {
|
||||
'etcd-client-ca.crt': std.base64($._config.etcd.clientCA),
|
||||
'etcd-client.key': std.base64($._config.etcd.clientKey),
|
||||
'etcd-client.crt': std.base64($._config.etcd.clientCert),
|
||||
'etcd-client-ca.crt': std.base64($.values.etcd.clientCA),
|
||||
'etcd-client.key': std.base64($.values.etcd.clientKey),
|
||||
'etcd-client.crt': std.base64($.values.etcd.clientCert),
|
||||
},
|
||||
},
|
||||
prometheus+: {
|
||||
48
jsonnet/kube-prometheus/addons/strip-limits.libsonnet
Normal file
48
jsonnet/kube-prometheus/addons/strip-limits.libsonnet
Normal file
@@ -0,0 +1,48 @@
|
||||
// Strips spec.containers[].resources.limits for certain containers
|
||||
// https://github.com/prometheus-operator/kube-prometheus/issues/72
|
||||
|
||||
{
|
||||
local noLimit(c) =
|
||||
//if std.objectHas(c, 'resources') && c.name != 'kube-state-metrics'
|
||||
if c.name != 'kube-state-metrics'
|
||||
then c { resources+: { limits: {} } }
|
||||
else c,
|
||||
|
||||
nodeExporter+: {
|
||||
daemonset+: {
|
||||
spec+: {
|
||||
template+: {
|
||||
spec+: {
|
||||
containers: std.map(noLimit, super.containers),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
kubeStateMetrics+: {
|
||||
deployment+: {
|
||||
spec+: {
|
||||
template+: {
|
||||
spec+: {
|
||||
containers: std.map(noLimit, super.containers),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusOperator+: {
|
||||
deployment+: {
|
||||
spec+: {
|
||||
template+: {
|
||||
spec+: {
|
||||
local addArgs(c) =
|
||||
if c.name == 'prometheus-operator'
|
||||
then c { args+: ['--config-reloader-cpu=0'] }
|
||||
else c,
|
||||
containers: std.map(addArgs, super.containers),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -1,16 +1,16 @@
|
||||
(import 'github.com/thanos-io/thanos/mixin/alerts/sidecar.libsonnet') +
|
||||
{
|
||||
_config+:: {
|
||||
versions+:: { thanos: 'v0.14.0' },
|
||||
imageRepos+:: { thanos: 'quay.io/thanos/thanos' },
|
||||
values+:: {
|
||||
thanos+:: {
|
||||
version: '0.14.0',
|
||||
image: 'quay.io/thanos/thanos:v0.14.0',
|
||||
objectStorageConfig: {
|
||||
key: 'thanos.yaml', // How the file inside the secret is called
|
||||
name: 'thanos-objectstorage', // This is the name of your Kubernetes secret with the config
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheus+:: {
|
||||
prometheus+: {
|
||||
local p = self,
|
||||
|
||||
// Add the gRPC port to the Prometheus service so that it can be queried by the Thanos Querier
|
||||
@@ -26,9 +26,9 @@
|
||||
apiVersion: 'v1',
|
||||
kind: 'Service',
|
||||
metadata: {
|
||||
name: 'prometheus-' + p.name + '-thanos-sidecar',
|
||||
namespace: p.namespace,
|
||||
labels: { prometheus: p.name, app: 'thanos-sidecar' },
|
||||
name: 'prometheus-' + p.config.name + '-thanos-sidecar',
|
||||
namespace: p.config.namespace,
|
||||
labels: { prometheus: p.config.name, app: 'thanos-sidecar' },
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
@@ -42,9 +42,9 @@
|
||||
prometheus+: {
|
||||
spec+: {
|
||||
thanos+: {
|
||||
version: $._config.versions.thanos,
|
||||
image: $._config.imageRepos.thanos + ':' + $._config.versions.thanos,
|
||||
objectStorageConfig: $._config.thanos.objectStorageConfig,
|
||||
version: $.values.thanos.version,
|
||||
image: $.values.thanos.image,
|
||||
objectStorageConfig: $.values.thanos.objectStorageConfig,
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -54,7 +54,7 @@
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'thanos-sidecar',
|
||||
namespace: p.namespace,
|
||||
namespace: p.config.namespace,
|
||||
labels: {
|
||||
'app.kubernetes.io/name': 'prometheus',
|
||||
},
|
||||
@@ -64,7 +64,7 @@
|
||||
jobLabel: 'app',
|
||||
selector: {
|
||||
matchLabels: {
|
||||
prometheus: p.name,
|
||||
prometheus: p.config.name,
|
||||
app: 'thanos-sidecar',
|
||||
},
|
||||
},
|
||||
134
jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet
Normal file
134
jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet
Normal file
@@ -0,0 +1,134 @@
|
||||
[
|
||||
{
|
||||
alert: 'WeaveNetIPAMSplitBrain',
|
||||
expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.',
|
||||
description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMUnreachable',
|
||||
expr: 'weave_ipam_unreachable_percentage > 25',
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingAllocates',
|
||||
expr: 'sum(weave_ipam_pending_allocates) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending allocates is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingClaims',
|
||||
expr: 'sum(weave_ipam_pending_claims) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending claims is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsLow',
|
||||
expr: 'sum(weave_flows) < 15000',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of FastDP flows is below the threshold.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsOff',
|
||||
expr: 'sum(weave_flows == bool 0) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'FastDP flows is zero.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to be off and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetHighConnectionTerminationRate',
|
||||
expr: 'rate(weave_connection_terminations_total[5m]) > 0.1',
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are getting terminated.',
|
||||
description: 'actionable: Please find the reason for the high connection termination rate and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsConnecting',
|
||||
expr: 'sum(weave_connections{state="connecting"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in connecting state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsRetying',
|
||||
expr: 'sum(weave_connections{state="retrying"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in retrying state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsPending',
|
||||
expr: 'sum(weave_connections{state="pending"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in pending state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsFailed',
|
||||
expr: 'sum(weave_connections{state="failed"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in failed state.',
|
||||
description: 'actionable: Please find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
]
|
||||
73
jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet
Normal file
73
jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet
Normal file
@@ -0,0 +1,73 @@
|
||||
{
|
||||
prometheus+: {
|
||||
local p = self,
|
||||
serviceWeaveNet: {
|
||||
apiVersion: 'v1',
|
||||
kind: 'Service',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
namespace: 'kube-system',
|
||||
labels: { 'app.kubernetes.io/name': 'weave-net' },
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
{ name: 'weave-net-metrics', targetPort: 6782, port: 6782 },
|
||||
],
|
||||
selector: { name: 'weave-net' },
|
||||
clusterIP: 'None',
|
||||
},
|
||||
},
|
||||
serviceMonitorWeaveNet: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
labels: {
|
||||
'app.kubernetes.io/name': 'weave-net',
|
||||
},
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
spec: {
|
||||
jobLabel: 'app.kubernetes.io/name',
|
||||
endpoints: [
|
||||
{
|
||||
port: 'weave-net-metrics',
|
||||
path: '/metrics',
|
||||
interval: '15s',
|
||||
},
|
||||
],
|
||||
namespaceSelector: {
|
||||
matchNames: [
|
||||
'kube-system',
|
||||
],
|
||||
},
|
||||
selector: {
|
||||
matchLabels: {
|
||||
'app.kubernetes.io/name': 'weave-net',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusRuleWeaveNet: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'PrometheusRule',
|
||||
metadata: {
|
||||
labels: p.config.mixin.ruleLabels,
|
||||
name: 'weave-net-rules',
|
||||
namespace: p.config.namespace,
|
||||
},
|
||||
spec: {
|
||||
groups: [{
|
||||
name: 'weave-net',
|
||||
rules: (import './alerts.libsonnet'),
|
||||
}],
|
||||
},
|
||||
},
|
||||
mixin+:: {
|
||||
grafanaDashboards+:: {
|
||||
'weave-net.json': (import './grafana-weave-net.json'),
|
||||
'weave-net-cluster.json': (import './grafana-weave-net-cluster.json'),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
local krp = import '../kube-rbac-proxy/container.libsonnet';
|
||||
local krp = import './kube-rbac-proxy.libsonnet';
|
||||
|
||||
local defaults = {
|
||||
local defaults = self,
|
||||
@@ -1,4 +1,4 @@
|
||||
local krp = (import '../kube-rbac-proxy/container.libsonnet');
|
||||
local krp = import './kube-rbac-proxy.libsonnet';
|
||||
|
||||
local defaults = {
|
||||
local defaults = self,
|
||||
@@ -1,4 +1,4 @@
|
||||
local krp = (import '../kube-rbac-proxy/container.libsonnet');
|
||||
local krp = import './kube-rbac-proxy.libsonnet';
|
||||
|
||||
local defaults = {
|
||||
local defaults = self,
|
||||
@@ -1,4 +1,4 @@
|
||||
local krp = (import '../kube-rbac-proxy/container.libsonnet');
|
||||
local krp = import './kube-rbac-proxy.libsonnet';
|
||||
local prometheusOperator = import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet';
|
||||
|
||||
local defaults = {
|
||||
@@ -1,4 +1,4 @@
|
||||
local relabelings = import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet';
|
||||
local relabelings = import '../addons/dropping-deprecated-metrics-relabelings.libsonnet';
|
||||
|
||||
local defaults = {
|
||||
local defaults = self,
|
||||
@@ -1,8 +0,0 @@
|
||||
local kp = (import './kube-prometheus/kube-prometheus.libsonnet');
|
||||
|
||||
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
|
||||
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
|
||||
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
|
||||
{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
|
||||
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
|
||||
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
|
||||
@@ -1,35 +0,0 @@
|
||||
// Strips spec.containers[].limits for certain containers
|
||||
// https://github.com/prometheus-operator/kube-prometheus/issues/72
|
||||
{
|
||||
_config+:: {
|
||||
resources+:: {
|
||||
'addon-resizer'+: {
|
||||
limits: {},
|
||||
},
|
||||
'kube-rbac-proxy'+: {
|
||||
limits: {},
|
||||
},
|
||||
'kube-state-metrics'+: {
|
||||
limits: {},
|
||||
},
|
||||
'node-exporter'+: {
|
||||
limits: {},
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusOperator+: {
|
||||
deployment+: {
|
||||
spec+: {
|
||||
template+: {
|
||||
spec+: {
|
||||
local addArgs(c) =
|
||||
if c.name == 'prometheus-operator'
|
||||
then c { args+: ['--config-reloader-cpu=0'] }
|
||||
else c,
|
||||
containers: std.map(addArgs, super.containers),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -1,196 +0,0 @@
|
||||
{
|
||||
prometheus+: {
|
||||
serviceWeaveNet: {
|
||||
apiVersion: 'v1',
|
||||
kind: 'Service',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
namespace: 'kube-system',
|
||||
labels: { 'app.kubernetes.io/name': 'weave-net' },
|
||||
},
|
||||
spec: {
|
||||
ports: [
|
||||
{ name: 'weave-net-metrics', targetPort: 6782, port: 6782 },
|
||||
],
|
||||
selector: { name: 'weave-net' },
|
||||
clusterIP: 'None',
|
||||
},
|
||||
},
|
||||
serviceMonitorWeaveNet: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'weave-net',
|
||||
labels: {
|
||||
'app.kubernetes.io/name': 'weave-net',
|
||||
},
|
||||
namespace: 'monitoring',
|
||||
},
|
||||
spec: {
|
||||
jobLabel: 'app.kubernetes.io/name',
|
||||
endpoints: [
|
||||
{
|
||||
port: 'weave-net-metrics',
|
||||
path: '/metrics',
|
||||
interval: '15s',
|
||||
},
|
||||
],
|
||||
namespaceSelector: {
|
||||
matchNames: [
|
||||
'kube-system',
|
||||
],
|
||||
},
|
||||
selector: {
|
||||
matchLabels: {
|
||||
'app.kubernetes.io/name': 'weave-net',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusRules+: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'weave-net',
|
||||
rules: [
|
||||
{
|
||||
alert: 'WeaveNetIPAMSplitBrain',
|
||||
expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.',
|
||||
description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMUnreachable',
|
||||
expr: 'weave_ipam_unreachable_percentage > 25',
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingAllocates',
|
||||
expr: 'sum(weave_ipam_pending_allocates) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending allocates is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetIPAMPendingClaims',
|
||||
expr: 'sum(weave_ipam_pending_claims) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of pending claims is above the threshold.',
|
||||
description: 'actionable: Please find the problem and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsLow',
|
||||
expr: 'sum(weave_flows) < 15000',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Number of FastDP flows is below the threshold.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetFastDPFlowsOff',
|
||||
expr: 'sum(weave_flows == bool 0) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'FastDP flows is zero.',
|
||||
description: 'actionable: Please find the reason for FastDP flows to be off and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetHighConnectionTerminationRate',
|
||||
expr: 'rate(weave_connection_terminations_total[5m]) > 0.1',
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are getting terminated.',
|
||||
description: 'actionable: Please find the reason for the high connection termination rate and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsConnecting',
|
||||
expr: 'sum(weave_connections{state="connecting"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in connecting state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsRetying',
|
||||
expr: 'sum(weave_connections{state="retrying"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in retrying state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsPending',
|
||||
expr: 'sum(weave_connections{state="pending"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in pending state.',
|
||||
description: 'actionable: Please find the reason for this and fix it.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'WeaveNetConnectionsFailed',
|
||||
expr: 'sum(weave_connections{state="failed"}) > 0',
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A lot of connections are in failed state.',
|
||||
description: 'actionable: Please find the reason and fix it.',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
grafanaDashboards+:: {
|
||||
'weave-net.json': (import './grafana-weave-net.json'),
|
||||
'weave-net-cluster.json': (import './grafana-weave-net-cluster.json'),
|
||||
},
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
// imageName extracts the image name from a fully qualified image string. eg.
|
||||
// quay.io/coreos/addon-resizer -> addon-resizer
|
||||
// grafana/grafana -> grafana
|
||||
local imageName(image) =
|
||||
local parts = std.split(image, '/');
|
||||
local len = std.length(parts);
|
||||
if len == 3 then
|
||||
// registry.com/org/image
|
||||
parts[2]
|
||||
else if len == 2 then
|
||||
// org/image
|
||||
parts[1]
|
||||
else if len == 1 then
|
||||
// image, ie. busybox
|
||||
parts[0]
|
||||
else
|
||||
error 'unknown image format: ' + image;
|
||||
|
||||
{
|
||||
imageName:: imageName,
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
(import 'image.libsonnet')
|
||||
@@ -1,13 +1,13 @@
|
||||
local alertmanager = import './alertmanager/alertmanager.libsonnet';
|
||||
local blackboxExporter = import './blackbox-exporter/blackbox-exporter.libsonnet';
|
||||
local grafana = import './grafana/grafana.libsonnet';
|
||||
local kubeStateMetrics = import './kube-state-metrics/kube-state-metrics.libsonnet';
|
||||
local customMixin = import './mixin/custom.libsonnet';
|
||||
local kubernetesMixin = import './mixin/kubernetes.libsonnet';
|
||||
local nodeExporter = import './node-exporter/node-exporter.libsonnet';
|
||||
local prometheusAdapter = import './prometheus-adapter/prometheus-adapter.libsonnet';
|
||||
local prometheusOperator = import './prometheus-operator/prometheus-operator.libsonnet';
|
||||
local prometheus = import './prometheus/prometheus.libsonnet';
|
||||
local alertmanager = import './components/alertmanager.libsonnet';
|
||||
local blackboxExporter = import './components/blackbox-exporter.libsonnet';
|
||||
local grafana = import './components/grafana.libsonnet';
|
||||
local kubeStateMetrics = import './components/kube-state-metrics.libsonnet';
|
||||
local customMixin = import './components/mixin/custom.libsonnet';
|
||||
local kubernetesMixin = import './components/mixin/kubernetes.libsonnet';
|
||||
local nodeExporter = import './components/node-exporter.libsonnet';
|
||||
local prometheusAdapter = import './components/prometheus-adapter.libsonnet';
|
||||
local prometheusOperator = import './components/prometheus-operator.libsonnet';
|
||||
local prometheus = import './components/prometheus.libsonnet';
|
||||
|
||||
{
|
||||
// using `values` as this is similar to helm
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
_config+:: {
|
||||
values+:: {
|
||||
eks: {
|
||||
minimumAvailableIPs: 10,
|
||||
minimumAvailableIPsTime: '10m',
|
||||
@@ -39,7 +39,7 @@
|
||||
kind: 'ServiceMonitor',
|
||||
metadata: {
|
||||
name: 'awsekscni',
|
||||
namespace: $._config.namespace,
|
||||
namespace: $.values.common.namespace,
|
||||
labels: {
|
||||
'app.kubernetes.io/name': 'eks-cni',
|
||||
},
|
||||
@@ -65,25 +65,34 @@
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
prometheusRules+: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'kube-prometheus-eks.rules',
|
||||
rules: [
|
||||
prometheusRuleEksCNI: {
|
||||
apiVersion: 'monitoring.coreos.com/v1',
|
||||
kind: 'PrometheusRule',
|
||||
metadata: {
|
||||
labels: $.prometheus.config.commonLabels + $.prometheus.config.mixin.ruleLabels,
|
||||
name: 'eks-rules',
|
||||
namespace: $.prometheus.config.namespace,
|
||||
},
|
||||
spec: {
|
||||
groups: [
|
||||
{
|
||||
expr: 'sum by(instance) (awscni_ip_max) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs,
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.',
|
||||
},
|
||||
'for': $._config.eks.minimumAvailableIPsTime,
|
||||
alert: 'EksAvailableIPs',
|
||||
name: 'kube-prometheus-eks.rules',
|
||||
rules: [
|
||||
{
|
||||
expr: 'sum by(instance) (awscni_ip_max) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $.values.eks.minimumAvailableIPs,
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.',
|
||||
},
|
||||
'for': $.values.eks.minimumAvailableIPsTime,
|
||||
alert: 'EksAvailableIPs',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
(import './kube-prometheus-managed-cluster.libsonnet') + {
|
||||
(import '../addons/managed-cluster.libsonnet') + {
|
||||
_config+:: {
|
||||
prometheusAdapter+:: {
|
||||
config+: {
|
||||
Reference in New Issue
Block a user