diff --git a/jsonnet/kube-prometheus/components/grafana.libsonnet b/jsonnet/kube-prometheus/components/grafana.libsonnet index d6d8cc8d..a46b0845 100644 --- a/jsonnet/kube-prometheus/components/grafana.libsonnet +++ b/jsonnet/kube-prometheus/components/grafana.libsonnet @@ -24,6 +24,12 @@ local defaults = { if !std.setMember(labelName, ['app.kubernetes.io/version']) }, prometheusName:: error 'must provide prometheus name', + mixin: { + ruleLabels: {}, + _config: { + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/grafana/%s', + }, + }, }; function(params) @@ -40,6 +46,27 @@ function(params) labels: g._config.commonLabels, }, + mixin:: + (import 'github.com/grafana/grafana/grafana-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') + { + _config+:: g._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: g._config.commonLabels + g._config.mixin.ruleLabels, + name: g._config.name + '-rules', + namespace: g._config.namespace, + }, + spec: { + local r = if std.objectHasAll(g.mixin, 'prometheusRules') then g.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(g.mixin, 'prometheusAlerts') then g.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + serviceMonitor: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index c9b9c67e..b2d99cc7 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -10,6 +10,16 @@ }, "version": "master" }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafana", + "subdir": "grafana-mixin" + } + }, + "version": "main", + "name": "grafana-mixin" + }, { "source": { "git": { diff --git a/jsonnet/kube-prometheus/main.libsonnet b/jsonnet/kube-prometheus/main.libsonnet index 9dc5a334..3349c8f6 100644 --- a/jsonnet/kube-prometheus/main.libsonnet +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -70,7 +70,12 @@ local utils = import './lib/utils.libsonnet'; image: $.values.common.images.grafana, prometheusName: $.values.prometheus.name, // TODO(paulfantom) This should be done by iterating over all objects and looking for object.mixin.grafanaDashboards - dashboards: $.nodeExporter.mixin.grafanaDashboards + $.prometheus.mixin.grafanaDashboards + $.kubernetesControlPlane.mixin.grafanaDashboards + $.alertmanager.mixin.grafanaDashboards, + dashboards: $.nodeExporter.mixin.grafanaDashboards + + $.prometheus.mixin.grafanaDashboards + + $.kubernetesControlPlane.mixin.grafanaDashboards + + $.alertmanager.mixin.grafanaDashboards + + $.grafana.mixin.grafanaDashboards, + mixin+: { ruleLabels: $.values.common.ruleLabels }, }, kubeStateMetrics: { namespace: $.values.common.namespace, diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5c9c7ed8..fe0ea145 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -21,6 +21,16 @@ "version": "69279532f46cb7491bb60a588be534100fdc058d", "sum": "cdKL5kPYfpWSpTCu4qctmh+gWQqL+4YWom6rw9qLYJU=" }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafana.git", + "subdir": "grafana-mixin" + } + }, + "version": "88bc47441f75b5956023be0b1f79c371420cdcdd", + "sum": "MkjR7zCgq6MUZgjDzop574tFKoTX2OBr7DTwm1K+Ofs=" + }, { "source": { "git": { diff --git a/kustomization.yaml b/kustomization.yaml index 6dfa024b..084af1b1 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -20,6 +20,7 @@ resources: - ./manifests/grafana-dashboardDefinitions.yaml - ./manifests/grafana-dashboardSources.yaml - ./manifests/grafana-deployment.yaml +- ./manifests/grafana-prometheusRule.yaml - ./manifests/grafana-service.yaml - ./manifests/grafana-serviceAccount.yaml - ./manifests/grafana-serviceMonitor.yaml diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index ac100edb..85c8945f 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -5410,6 +5410,631 @@ items: app.kubernetes.io/version: 8.3.3 name: grafana-dashboard-controller-manager namespace: monitoring +- apiVersion: v1 + data: + grafana-overview.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + + ], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3085, + "iteration": 1631554945276, + "links": [ + + ], + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "mappings": [ + + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": { + + }, + "textMode": "auto" + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Firing Alerts", + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": { + + }, + "textMode": "auto" + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Dashboards", + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": null, + "displayMode": "auto" + }, + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 10, + "options": { + "showHeader": true + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Build Info", + "transformations": [ + { + "id": "labelsToFields", + "options": { + + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "branch": true, + "container": true, + "goversion": true, + "namespace": true, + "pod": true, + "revision": true + }, + "indexByName": { + "Time": 7, + "Value": 11, + "branch": 4, + "container": 8, + "edition": 2, + "goversion": 6, + "instance": 1, + "job": 0, + "namespace": 9, + "pod": 10, + "revision": 5, + "version": 3 + }, + "renameByName": { + + } + } + } + ], + "type": "table" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ] + }, + "overrides": [ + + ] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (status_code) (irate(grafana_http_request_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[1m])) ", + "interval": "", + "legendFormat": "{{status_code}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "RPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "$$hashKey": "object:157", + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:158", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ] + }, + "overrides": [ + + ] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1", + "interval": "", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.50, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1", + "interval": "", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(irate(grafana_http_request_duration_seconds_sum{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) * 1 / sum(irate(grafana_http_request_duration_seconds_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Average", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "$$hashKey": "object:210", + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:211", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 30, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "dev-cortex", + "value": "dev-cortex" + }, + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": [ + "default/grafana" + ], + "value": [ + "default/grafana" + ] + }, + "datasource": "$datasource", + "definition": "label_values(grafana_build_info, job)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "job", + "options": [ + + ], + "query": { + "query": "label_values(grafana_build_info, job)", + "refId": "Billing Admin-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "definition": "label_values(grafana_build_info, instance)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance", + "options": [ + + ], + "query": { + "query": "label_values(grafana_build_info, instance)", + "refId": "Billing Admin-instance-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Grafana Overview", + "uid": "6be0s85Mk", + "version": 2 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.3.3 + name: grafana-dashboard-grafana-overview + namespace: monitoring - apiVersion: v1 data: k8s-resources-cluster.json: |- diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index de32bf7c..14cbaaea 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -67,6 +67,9 @@ spec: - mountPath: /grafana-dashboard-definitions/0/controller-manager name: grafana-dashboard-controller-manager readOnly: false + - mountPath: /grafana-dashboard-definitions/0/grafana-overview + name: grafana-dashboard-grafana-overview + readOnly: false - mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster name: grafana-dashboard-k8s-resources-cluster readOnly: false @@ -155,6 +158,9 @@ spec: - configMap: name: grafana-dashboard-controller-manager name: grafana-dashboard-controller-manager + - configMap: + name: grafana-dashboard-grafana-overview + name: grafana-dashboard-grafana-overview - configMap: name: grafana-dashboard-k8s-resources-cluster name: grafana-dashboard-k8s-resources-cluster diff --git a/manifests/grafana-prometheusRule.yaml b/manifests/grafana-prometheusRule.yaml new file mode 100644 index 00000000..a554e3f5 --- /dev/null +++ b/manifests/grafana-prometheusRule.yaml @@ -0,0 +1,34 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.3.3 + prometheus: k8s + role: alert-rules + name: grafana-rules + namespace: monitoring +spec: + groups: + - name: GrafanaAlerts + rules: + - alert: GrafanaRequestsFailing + annotations: + message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} + is experiencing {{ $value | humanize }}% errors' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing + expr: | + 100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} + / ignoring (status_code) + sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) + > 50 + for: 5m + labels: + severity: warning + - name: grafana_rules + rules: + - expr: | + sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) + record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m