add ConfigMap generate script

This also contains the generated ConfigMaps for the grafana dashboards
as well as for alerts. These will be updated in subsequent changes as
sets of alerts and dashboards are added/updated in the respective
subdirectories in `/assets`.
This commit is contained in:
Frederic Branczyk
2016-10-28 16:42:53 +02:00
parent 74a80cd19e
commit 12cd0fd34f
10 changed files with 3019 additions and 215 deletions

View File

@@ -1,206 +0,0 @@
# general cluster availability
# alert if another failed peer will result in an unavailable cluster
ALERT InsufficientPeers
IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
FOR 3m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "Etcd cluster small",
description = "If one more etcd peer goes down the cluster will be unavailable",
}
# etcd leader alerts
# ==================
# alert if any etcd instance has no leader
ALERT EtcdNoLeader
IF etcd_server_has_leader{job="etcd"} == 0
FOR 1m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "etcd node has no leader",
description = "etcd node {{ $labels.instance }} has no leader",
}
# alert if there are lots of leader changes
ALERT HighNumberOfLeaderChanges
IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of leader changes within the etcd cluster are happening",
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
}
# gRPC request alerts
# ===================
# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
ALERT HighNumberOfFailedGRPCRequests
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of gRPC requests are failing",
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
ALERT HighNumberOfFailedGRPCRequests
IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of gRPC requests are failing",
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if the 99th percentile of gRPC method calls take more than 150ms
ALERT GRPCRequestsSlow
IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "slow gRPC requests",
description = "on ectd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow",
}
# HTTP requests alerts
# ====================
# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if the 99th percentile of HTTP requests take more than 150ms
ALERT HTTPRequestsSlow
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "slow HTTP requests",
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
}
# file descriptor alerts
# ======================
instance:fd_utilization = process_open_fds / process_max_fds
# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
}
# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
}
# etcd peer communication alerts
# ==============================
# alert if 99th percentile of round trips take 150ms
ALERT EtcdPeerCommunicationSlow
IF histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "etcd peer communication is slow",
description = "ectd instance {{ $labels.instance }} peer communication with {{ $label.To }} is slow",
}
# etcd proposal alerts
# ====================
# alert if there are several failed proposals within an hour
ALERT HighNumberOfFailedProposals
IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of failed proposals within the etcd cluster are happening",
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
}
# etcd disk io latency alerts
# ===========================
# alert if 99th percentile of fsync durations is higher than 500ms
ALERT HighFsyncDurations
IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "high fsync durations",
description = "ectd instance {{ $labels.instance }} fync durations are high",
}
# alert if 99th percentile of commit durations is higher than 250ms
ALERT HighCommitDurations
IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "high commit durations",
description = "ectd instance {{ $labels.instance }} commit durations are high",
}

0
assets/alerts/.gitkeep Normal file
View File

View File

@@ -0,0 +1,588 @@
{
"dashboard": {
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "singlestat",
"name": "Singlestat",
"type": "panel",
"version": ""
},
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "3.1.1"
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
}
],
"annotations": {
"list": []
},
"description": "",
"editable": true,
"gnetId": 162,
"hideControls": true,
"id": null,
"links": [
{
"icon": "external link",
"includeVars": true,
"tags": [],
"targetBlank": false,
"type": "dashboards"
}
],
"refresh": "10s",
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 6,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "sum(sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",instance=~\"$node\"}[1m] ) )) / count(node_cpu{mode=\"system\",instance=~\"$node:9100\"}) * 100",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 10
}
],
"thresholds": "65, 90",
"title": "Cluster CPU Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 3,
"editable": true,
"error": false,
"fill": 0,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": false,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 9,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(sum by (container_name)(rate(container_cpu_usage_seconds_total{image!=\"\",instance=~\"$node\"}[1m])))",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "CPU Usage",
"metric": "container_cpu_usage_seconds_total",
"refId": "A",
"step": 10
},
{
"expr": "count(node_cpu{mode=\"system\",instance=~\"$node:9100\"})",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "CPU Total",
"metric": "node_cpu",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Cluster CPU Usage",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 4,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "(sum(node_memory_MemTotal{instance=~\"$node:9100\"}) - sum(node_memory_MemFree{instance=~\"$node:9100\"}+node_memory_Buffers{instance=~\"$node:9100\"}+node_memory_Cached{instance=~\"$node:9100\"}) ) / sum(node_memory_MemTotal{instance=~\"$node:9100\"}) * 100",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 10
}
],
"thresholds": "65, 90",
"title": "Cluster memory usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"fill": 0,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": false,
"sideWidth": 200,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 9,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "(sum(node_memory_MemTotal{instance=~\"$node:9100\"}) - sum(node_memory_MemFree{instance=~\"$node:9100\"}+node_memory_Buffers{instance=~\"$node:9100\"}+node_memory_Cached{instance=~\"$node:9100\"}))",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Cluster Memory Usage",
"metric": "node_memory_MemAvailable",
"refId": "A",
"step": 10
},
{
"expr": "sum(node_memory_MemTotal{instance=~\"$node:9100\"})",
"interval": "10s",
"intervalFactor": 2,
"legendFormat": "Cluster Memory Total",
"metric": "node_memory_MemTotal",
"refId": "B",
"step": 20
}
],
"timeFrom": null,
"timeShift": null,
"title": "Cluster Memory Usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "New row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"editable": true,
"error": false,
"fill": 0,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 9,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 200,
"sort": "current",
"sortDesc": true,
"total": true,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum (rate (node_network_receive_bytes{instance=~\"$node:9100\"}[1m]))",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Received Bytes",
"metric": "network",
"refId": "A",
"step": 10
},
{
"expr": "sum (rate (node_network_transmit_bytes{instance=~\"$node:9100\"}[1m]))",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Transmitted Bytes",
"metric": "network",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Cluster Network I/O",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "New row"
}
],
"schemaVersion": 12,
"sharedCrosshair": true,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": true,
"label": "Node",
"multi": false,
"name": "node",
"options": [],
"query": "label_values(kube_node_info, node)",
"refresh": 1,
"regex": "",
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Kubernetes Cluster Monitoring",
"version": 0
},
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}

View File

@@ -0,0 +1,494 @@
{
"dashboard": {
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "singlestat",
"name": "Singlestat",
"type": "panel",
"version": ""
},
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "3.1.1"
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
}
],
"annotations": {
"list": []
},
"editable": true,
"gnetId": null,
"hideControls": true,
"id": null,
"links": [],
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 2,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Metadata Generation",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 3,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Observed Generation",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 4,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "kube_deployment_spec_paused{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Paused",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
}
],
"title": "Row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}",
"intervalFactor": 2,
"refId": "A",
"step": 30
},
{
"expr": "kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}",
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 30
},
{
"expr": "kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}",
"intervalFactor": 2,
"legendFormat": "",
"refId": "C",
"step": 30
},
{
"expr": "kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}",
"intervalFactor": 2,
"legendFormat": "",
"refId": "D",
"step": 30
},
{
"expr": "kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}",
"intervalFactor": 2,
"refId": "E",
"step": 30
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Deployment Replicas",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "New row"
}
],
"schemaVersion": 12,
"sharedCrosshair": true,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "deployment_namespace",
"options": [],
"query": "label_values(kube_deployment_metadata_generation, namespace)",
"refresh": 1,
"regex": "",
"sort": 0,
"tagValuesQuery": null,
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": false,
"label": "Deployment",
"multi": false,
"name": "deployment_name",
"options": [],
"query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)",
"refresh": 1,
"regex": "",
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "deployment",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Deployment",
"version": 3
},
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}

View File

@@ -0,0 +1,409 @@
{
"dashboard": {
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "3.1.1"
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
}
],
"annotations": {
"list": []
},
"editable": true,
"gnetId": null,
"hideControls": false,
"id": null,
"links": [],
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Current: {{ container_name }}",
"metric": "container_memory_usage_bytes",
"refId": "A",
"step": 10
},
{
"expr": "kube_pod_container_requested_memory_bytes{pod=\"$pod\", container=~\"$container\"}",
"interval": "10s",
"intervalFactor": 2,
"legendFormat": "Requested: {{ container }}",
"metric": "kube_pod_container_requested_memory_bytes",
"refId": "B",
"step": 20
}
],
"timeFrom": null,
"timeShift": null,
"title": "Memory Usage",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )",
"intervalFactor": 2,
"legendFormat": "{{ container_name }}",
"refId": "A",
"step": 30
}
],
"timeFrom": null,
"timeShift": null,
"title": "CPU Usage",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "New row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))",
"intervalFactor": 2,
"legendFormat": "{{ pod_name }}",
"refId": "A",
"step": 30
}
],
"timeFrom": null,
"timeShift": null,
"title": "Network I/O",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "New row"
}
],
"schemaVersion": 12,
"sharedCrosshair": true,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": true,
"label": "Namespace",
"multi": false,
"name": "namespace",
"options": [],
"query": "label_values(kube_pod_info, namespace)",
"refresh": 1,
"regex": "",
"type": "query"
},
{
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": false,
"label": "Pod",
"multi": false,
"name": "pod",
"options": [],
"query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)",
"refresh": 1,
"regex": "",
"type": "query"
},
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": true,
"label": "Container",
"multi": false,
"name": "container",
"options": [],
"query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)",
"refresh": 1,
"regex": "",
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Pods",
"version": 26
},
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}

View File

@@ -0,0 +1,7 @@
{
"access": "proxy",
"basicAuth": false,
"name": "prometheus",
"type": "prometheus",
"url": "http://prometheus-k8s.monitoring.svc:9090"
}

View File

@@ -0,0 +1,7 @@
#!/bin/bash
# Generate Alert Rules ConfigMap
kubectl create configmap --dry-run=true prometheus-k8s-alerts --from-file=assets/alerts/ -oyaml > manifests/prometheus/prometheus-k8s-alerts.yaml
# Generate Dashboard ConfigMap
kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-cm.yaml

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,7 @@
apiVersion: v1
data:
.gitkeep: ""
kind: ConfigMap
metadata:
creationTimestamp: null
name: prometheus-k8s-alerts

View File

@@ -7,6 +7,9 @@ data:
global:
evaluation_interval: 30s
rule_files:
- /etc/prometheus/rules/*.rules
scrape_configs:
- job_name: kubelets
scrape_interval: 20s