Merge branch 'main' of https://github.com/prometheus-operator/kube-prometheus into example/alertmanager-custom-config

This commit is contained in:
Blizter
2021-10-23 19:13:54 -04:00
118 changed files with 12030 additions and 3794 deletions

.github/env vendored Normal file

@@ -0,0 +1,2 @@
golang-version=1.16
kind-version=v0.11.1


@@ -22,6 +22,17 @@ jobs:
with:
go-version: ${{ env.golang-version }}
- run: make --always-make generate validate && git diff --exit-code
check-docs:
runs-on: ubuntu-latest
name: Check Documentation formatting and links
steps:
- uses: actions/checkout@v2
with:
persist-credentials: false
- uses: actions/setup-go@v2
with:
go-version: ${{ env.golang-version }}
- run: make check-docs
lint:
runs-on: ubuntu-latest
name: Jsonnet linter


@@ -23,11 +23,15 @@ jobs:
with:
go-version: 1.16
- name: Upgrade versions
id: versions
run: |
export GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }}
# Write to temporary file to make update atomic
scripts/generate-versions.sh > /tmp/versions.json
mv /tmp/versions.json jsonnet/kube-prometheus/versions.json
# Get the links to the changelogs of the updated versions and make them
# available to the reviewers
echo ::set-output name=new_changelogs::$(scripts/get-new-changelogs.sh)
if: matrix.branch == 'main'
- name: Update jsonnet dependencies
run: |
@@ -49,7 +53,12 @@ jobs:
This is an automated version and jsonnet dependencies update performed from CI.
Please review the following changelogs to make sure that we don't miss any important
changes before merging this PR.
${{ steps.versions.outputs.new_changelogs }}
Configuration of the workflow is located in `.github/workflows/versions.yaml`.
## Type of change
@@ -61,6 +70,8 @@ jobs:
```
team-reviewers: kube-prometheus-reviewers
committer: Prometheus Operator Bot <prom-op-bot@users.noreply.github.com>
author: Prometheus Operator Bot <prom-op-bot@users.noreply.github.com>
branch: automated-updates-${{ matrix.branch }}
delete-branch: true
# GITHUB_TOKEN cannot be used as it won't trigger CI in a created PR

.gitignore vendored

@@ -5,4 +5,5 @@ vendor/
.swp
crdschemas/
developer-workspace/gitpod/_output
kind


@@ -24,17 +24,17 @@ tasks:
chmod +x ${PWD}/.git/hooks/pre-commit
- name: run kube-prometheus
command: |
developer-workspace/gitpod/prepare-k3s.sh
developer-workspace/common/deploy-kube-prometheus.sh
- name: kernel dev environment
init: |
sudo apt update -y
sudo apt install qemu qemu-system-x86 linux-image-$(uname -r) libguestfs-tools sshpass netcat -y
sudo curl -o /usr/bin/kubectl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
sudo chmod +x /usr/bin/kubectl
developer-workspace/gitpod/prepare-rootfs.sh
command: |
developer-workspace/gitpod/qemu.sh
ports:
- port: 3000
onOpen: open-browser
@@ -44,4 +44,4 @@ ports:
onOpen: open-browser
vscode:
extensions:
- heptio.jsonnet

.mdox.validate.yaml Normal file

@@ -0,0 +1,9 @@
version: 1
validators:
# Ignore localhost links.
- regex: 'localhost'
type: "ignore"
# Ignore release links.
- regex: 'https:\/\/github\.com\/prometheus-operator\/kube-prometheus\/releases'
type: "ignore"


@@ -2,29 +2,39 @@ SHELL=/bin/bash -o pipefail
BIN_DIR?=$(shell pwd)/tmp/bin
MDOX_BIN=$(BIN_DIR)/mdox
JB_BIN=$(BIN_DIR)/jb
GOJSONTOYAML_BIN=$(BIN_DIR)/gojsontoyaml
JSONNET_BIN=$(BIN_DIR)/jsonnet
JSONNETLINT_BIN=$(BIN_DIR)/jsonnet-lint
JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt
KUBECONFORM_BIN=$(BIN_DIR)/kubeconform
TOOLING=$(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN) $(JSONNETLINT_BIN) $(JSONNETFMT_BIN) $(KUBECONFORM_BIN) $(MDOX_BIN)
JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s
MDOX_VALIDATE_CONFIG?=.mdox.validate.yaml
MD_FILES_TO_FORMAT=$(shell find docs developer-workspace examples experimental jsonnet manifests -name "*.md") $(shell ls *.md)
all: generate fmt test docs
.PHONY: clean
clean:
# Remove all files and directories ignored by git.
git clean -Xfd .
.PHONY: docs
docs: $(MDOX_BIN) $(shell find examples) build.sh example.jsonnet
@echo ">> formatting and local/remote links"
$(MDOX_BIN) fmt --soft-wraps -l --links.localize.address-regex="https://prometheus-operator.dev/.*" --links.validate.config-file=$(MDOX_VALIDATE_CONFIG) $(MD_FILES_TO_FORMAT)
.PHONY: check-docs
check-docs: $(MDOX_BIN) $(shell find examples) build.sh example.jsonnet
@echo ">> checking formatting and local/remote links"
$(MDOX_BIN) fmt --soft-wraps --check -l --links.localize.address-regex="https://prometheus-operator.dev/.*" --links.validate.config-file=$(MDOX_VALIDATE_CONFIG) $(MD_FILES_TO_FORMAT)
.PHONY: generate
generate: manifests
manifests: examples/kustomize.jsonnet $(GOJSONTOYAML_BIN) vendor
./build.sh $<
@@ -78,3 +88,8 @@ $(BIN_DIR):
$(TOOLING): $(BIN_DIR)
@echo Installing tools from scripts/tools.go
@cd scripts && cat tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go build -modfile=go.mod -o $(BIN_DIR) %
.PHONY: deploy
deploy:
./developer-workspace/codespaces/prepare-kind.sh
./developer-workspace/common/deploy-kube-prometheus.sh


@@ -80,8 +80,8 @@ You will need a Kubernetes cluster, that's it! By default it is assumed, that th
This means the kubelet configuration must contain these flags:
* `--authentication-token-webhook=true` This flag enables a `ServiceAccount` token to be used to authenticate against the kubelet(s). This can also be enabled by setting the kubelet configuration value `authentication.webhook.enabled` to `true`.
* `--authorization-mode=Webhook` With this flag, the kubelet performs an RBAC request against the API to determine whether the requesting entity (Prometheus in this case) is allowed to access a resource, specifically the `/metrics` endpoint for this project. This can also be enabled by setting the kubelet configuration value `authorization.mode` to `Webhook`.
This stack provides [resource metrics](https://github.com/kubernetes/metrics#resource-metrics-api) by deploying the [Prometheus Adapter](https://github.com/DirectXMan12/k8s-prometheus-adapter/).
This adapter is an Extension API Server and Kubernetes needs to have this feature enabled, otherwise the adapter has no effect, but is still deployed.
@@ -116,12 +116,12 @@ The following versions are supported and work as we test against these versions
## Quickstart
> Note: For versions before Kubernetes v1.21.z refer to the [Kubernetes compatibility matrix](#kubernetes-compatibility-matrix) in order to choose a compatible branch.
This project is intended to be used as a library (i.e. the intent is not for you to create your own modified copy of this repository).
Though for a quickstart, a compiled version of the Kubernetes [manifests](manifests) generated with this library (specifically with `example.jsonnet`) is checked into this repository so that you can try the content out quickly. To try out the stack un-customized, run:
* Create the monitoring stack using the config in the `manifests` directory:
```shell
# Create the namespace and CRDs, and then wait for them to be available before creating the remaining resources
@@ -135,7 +135,8 @@ Alternatively, the resources in both folders can be applied with a single comman
`kubectl create -f manifests/setup -f manifests`, but it may be necessary to run the command multiple times for all components to
be created successfully.
* And to teardown the stack:
```shell
kubectl delete --ignore-not-found=true -f manifests/ -f manifests/setup
```
@@ -173,14 +174,15 @@ Then access via [http://localhost:9093](http://localhost:9093)
## Customizing Kube-Prometheus
This section:
* describes how to customize the kube-prometheus library via compiling the kube-prometheus manifests yourself (as an alternative to the [Quickstart section](#quickstart)).
* still doesn't require you to make a copy of this entire repository, but rather only a copy of a few select files.
### Installing
The content of this project consists of a set of [jsonnet](http://jsonnet.org/) files making up a library to be consumed.
Install this library in your own project with [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler#install) (the jsonnet package manager):
```shell
$ mkdir my-kube-prometheus; cd my-kube-prometheus
$ jb init # Creates the initial/empty `jsonnetfile.json`
@@ -196,6 +198,7 @@ $ wget https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/rel
> For example, to install a given version of this library: `jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.7`
In order to update the kube-prometheus dependency, simply use the jsonnet-bundler update functionality:
```shell
$ jb update
```
@@ -210,8 +213,7 @@ Here's [example.jsonnet](example.jsonnet):
> Note: some of the following components must be configured beforehand. See [configuration](#configuration) and [customization-examples](#customization-examples).
```jsonnet mdox-exec="cat example.jsonnet"
local kp =
(import 'kube-prometheus/main.libsonnet') +
// Uncomment the following imports to enable its patches
@@ -250,8 +252,7 @@ local kp =
And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`):
```sh mdox-exec="cat build.sh"
#!/usr/bin/env bash
# This script uses arg $1 (name of *.jsonnet file to use) to generate the manifests/*.yaml files.
@@ -282,6 +283,7 @@ rm -f kustomization
This script runs the jsonnet code, then reads each key of the generated JSON, uses the key as the file name, writes the key's value to that file, and converts each JSON manifest to YAML.
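For intuition, here is a minimal, hypothetical jsonnet file in the shape `build.sh` consumes (file names and manifests below are made up for illustration):

```jsonnet
// Hypothetical input for build.sh: a top-level object mapping
// file names (keys) to Kubernetes manifests (values).
{
  namespace: {
    apiVersion: 'v1',
    kind: 'Namespace',
    metadata: { name: 'monitoring' },
  },
  'example-service': {
    apiVersion: 'v1',
    kind: 'Service',
    metadata: { name: 'example', namespace: 'monitoring' },
    spec: { ports: [{ name: 'http', port: 8080 }] },
  },
}
```

Running `./build.sh` against a file like this would write one YAML manifest per key under `manifests/`.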
### Apply the kube-prometheus stack
The previous steps (compilation) have created a bunch of manifest files in the manifests/ folder.
Now simply use `kubectl` to install Prometheus and Grafana as per your configuration:
@@ -290,6 +292,7 @@ Now simply use `kubectl` to install Prometheus and Grafana as per your configura
$ kubectl apply -f manifests/setup
$ kubectl apply -f manifests/
```
Alternatively, the resources in both folders can be applied with a single command
`kubectl apply -Rf manifests`, but it may be necessary to run the command multiple times for all components to
be created successfully.
@@ -299,15 +302,18 @@ Check the monitoring namespace (or the namespace you have specific in `namespace
### Containerized Installing and Compiling
If you'd rather not have `jb`, `jsonnet`, or `gojsontoyaml` installed locally, use the `quay.io/coreos/jsonnet-ci` container image. Do the following from this `kube-prometheus` directory:
```shell
$ docker run --rm -v $(pwd):$(pwd) --workdir $(pwd) quay.io/coreos/jsonnet-ci jb update
$ docker run --rm -v $(pwd):$(pwd) --workdir $(pwd) quay.io/coreos/jsonnet-ci ./build.sh example.jsonnet
```
## Update from upstream project
You may wish to fetch changes made on this project so they are available to you.
### Update jb
`jb` may have been updated so it's a good idea to get the latest version of this binary:
```shell
@@ -315,14 +321,16 @@ $ go get -u github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
```
### Update kube-prometheus
The command below will sync with the upstream project:
```shell
$ jb update
```
### Compile the manifests and apply
Once updated, just follow the instructions under "Compiling" and "Apply the kube-prometheus stack" to apply the changes to your cluster.
## Configuration
@@ -343,7 +351,8 @@ Configuration is mainly done in the `values` map. You can see this being used in
},
```
The Grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but the needed configuration can be customized from the same top-level `values` field. For example, to allow anonymous access to Grafana, add the following `values` section:
```
grafana+:: {
config: { // http://docs.grafana.org/installation/configuration/
@@ -366,14 +375,14 @@ A common example is that not all Kubernetes clusters are created exactly the sam
* bootkube
* eks
* gke
* kops
* kops_coredns
* kubeadm
* kubespray
These mixins are selectable via the `platform` field of kubePrometheus:
```jsonnet mdox-exec="cat examples/jsonnet-snippets/platform.jsonnet"
(import 'kube-prometheus/main.libsonnet') +
{
values+:: {
@@ -405,8 +414,7 @@ The output of this command can be piped to a shell to be executed by appending `
Then to generate manifests with `internal-registry.com/organization`, use the `withImageRepository` mixin:
```jsonnet mdox-exec="cat examples/internal-registry.jsonnet"
local mixin = import 'kube-prometheus/addons/config-mixins.libsonnet';
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
@@ -429,8 +437,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
Another mixin that may be useful for exploring the stack is to expose the UIs of Prometheus, Alertmanager and Grafana on NodePorts:
```jsonnet mdox-exec="cat examples/jsonnet-snippets/node-ports.jsonnet"
(import 'kube-prometheus/main.libsonnet') +
(import 'kube-prometheus/addons/node-ports.libsonnet')
```
@@ -439,8 +446,7 @@ Another mixin that may be useful for exploring the stack is to expose the UIs of
To give another customization example, the name of the `Prometheus` object provided by this library can be overridden:
```jsonnet mdox-exec="cat examples/prometheus-name-override.jsonnet"
((import 'kube-prometheus/main.libsonnet') + {
prometheus+: {
prometheus+: {
@@ -456,8 +462,7 @@ To give another customization example, the name of the `Prometheus` object provi
Standard Kubernetes manifests are all written using [ksonnet-lib](https://github.com/ksonnet/ksonnet-lib/), so they can be modified with the mixins supplied by ksonnet-lib. For example to override the namespace of the node-exporter DaemonSet:
```jsonnet mdox-exec="cat examples/ksonnet-example.jsonnet"
((import 'kube-prometheus/main.libsonnet') + {
nodeExporter+: {
daemonset+: {
@@ -473,8 +478,7 @@ Standard Kubernetes manifests are all written using [ksonnet-lib](https://github
The Alertmanager configuration is located in the `values.alertmanager.config` configuration field. In order to set a custom Alertmanager configuration simply set this field.
```jsonnet mdox-exec="cat examples/alertmanager-config.jsonnet"
((import 'kube-prometheus/main.libsonnet') + {
values+:: {
alertmanager+: {
@@ -501,8 +505,7 @@ The Alertmanager configuration is located in the `values.alertmanager.config` co
In the above example the configuration has been inlined, but can just as well be an external file imported in jsonnet via the `importstr` function.
```jsonnet mdox-exec="cat examples/alertmanager-config-external.jsonnet"
((import 'kube-prometheus/main.libsonnet') + {
values+:: {
alertmanager+: {
@@ -516,8 +519,7 @@ In the above example the configuration has been inlined, but can just as well be
In order to monitor additional namespaces, the Prometheus server requires the appropriate `Role` and `RoleBinding` to be able to discover targets from that namespace. By default the Prometheus server is limited to the three namespaces it requires: default, kube-system and the namespace you configure the stack to run in via `$.values.namespace`. This is specified in `$.values.prometheus.namespaces`; to add new namespaces to monitor, simply append the additional namespaces:
```jsonnet mdox-exec="cat examples/additional-namespaces.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+: {
@@ -547,8 +549,7 @@ In order to Prometheus be able to discovery and scrape services inside the addit
You can define ServiceMonitor resources in your `jsonnet` spec. See the snippet below:
```jsonnet mdox-exec="cat examples/additional-namespaces-servicemonitor.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+: {
@@ -575,7 +576,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
],
selector: {
matchLabels: {
'app.kubernetes.io/name': 'myapp',
},
},
},
@@ -598,10 +599,9 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
### Monitoring all namespaces
In case you want to monitor all namespaces in a cluster, you can add the following mixin. Also, make sure to empty the namespaces defined in prometheus so that roleBindings are not created against them.
```jsonnet mdox-exec="cat examples/all-namespaces.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') +
(import 'kube-prometheus/addons/all-namespaces.libsonnet') + {
values+:: {
@@ -629,7 +629,7 @@ Proceed with [creating ServiceMonitors for the services in the namespaces](#defi
### Static etcd configuration
In order to configure a static etcd cluster to scrape there is a simple [static-etcd.libsonnet](jsonnet/kube-prometheus/addons/static-etcd.libsonnet) mixin prepared - see [etcd.jsonnet](examples/etcd.jsonnet) for an example of how to use that mixin, and [Monitoring external etcd](docs/monitoring-external-etcd.md) for more information.
> Note that monitoring etcd in minikube is currently not possible because of how etcd is setup. (minikube's etcd binds to 127.0.0.1:2379 only, and within host networking namespace.)
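For orientation, usage looks roughly like the following sketch (field names follow [etcd.jsonnet](examples/etcd.jsonnet); the IP, certificate files, and server name are placeholders you must supply):

```jsonnet
// Sketch based on examples/etcd.jsonnet: the static-etcd addon consumes
// values.etcd to build the Secret, Service, Endpoints, and ServiceMonitor.
local kp = (import 'kube-prometheus/main.libsonnet') +
           (import 'kube-prometheus/addons/static-etcd.libsonnet') + {
  values+:: {
    etcd+: {
      // Placeholder values: point these at your own etcd cluster.
      ips: ['10.0.0.2'],
      clientCA: importstr 'etcd-client-ca.crt',
      clientKey: importstr 'etcd-client.key',
      clientCert: importstr 'etcd-client.crt',
      serverName: 'etcd.my-cluster.local',
    },
  },
};

{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) }
```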
@@ -638,8 +638,7 @@ In order to configure a static etcd cluster to scrape there is a simple [kube-pr
To prevent `Prometheus` and `Alertmanager` instances from being deployed onto the same node when
possible, one can include the [anti-affinity.libsonnet](jsonnet/kube-prometheus/addons/anti-affinity.libsonnet) mixin:
```jsonnet mdox-exec="cat examples/anti-affinity.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') +
(import 'kube-prometheus/addons/anti-affinity.libsonnet') + {
values+:: {
@@ -663,8 +662,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') +
Sometimes in small clusters, the CPU/memory limits can get high enough for alerts to be fired continuously. To prevent this, one can strip off the predefined limits.
To do that, one can import the following mixin
```jsonnet mdox-exec="cat examples/strip-limits.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') +
(import 'kube-prometheus/addons/strip-limits.libsonnet') + {
values+:: {
@@ -758,11 +756,11 @@ resources. One driver for more resource needs, is a high number of
namespaces. There may be others.
kube-state-metrics resource allocation is managed by
[addon-resizer](https://github.com/kubernetes/autoscaler/tree/master/addon-resizer/nanny)
You can control its parameters by setting variables in the
config. They default to:
```jsonnet
kubeStateMetrics+:: {
baseCPU: '100m',
cpuPerNode: '2m',
@@ -772,11 +770,12 @@ config. They default to:
```
### Error retrieving kube-proxy metrics
By default, kubeadm will configure kube-proxy to listen on 127.0.0.1 for metrics. Because of this, Prometheus would not be able to scrape these metrics. This would have to be changed to 0.0.0.0 in one of the following two places:
1. Before cluster initialization, the config file passed to `kubeadm init` should have a KubeProxyConfiguration manifest with the field `metricsBindAddress` set to `0.0.0.0:10249` (see the fragment below)
2. If the k8s cluster is already up and running, we'll have to modify the configmap `kube-proxy` in the namespace `kube-system` and set the `metricsBindAddress` field. After this, the kube-proxy daemonset would have to be restarted with
`kubectl -n kube-system rollout restart daemonset kube-proxy`
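For the first option, the relevant fragment of the kubeadm configuration file would look roughly like this (API group/version as of recent Kubernetes releases; verify against your kubeadm version):

```
apiVersion: kubeproxy.config.k8s.io/v1alpha1
kind: KubeProxyConfiguration
metricsBindAddress: 0.0.0.0:10249
```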
## Contributing
@@ -788,8 +787,8 @@ the following process:
2. Commit your changes (This is currently necessary due to our vendoring
process. This is likely to change in the future).
3. Update the pinned kube-prometheus dependency in `jsonnetfile.lock.json`: `jb update`
4. Generate dependent `*.yaml` files: `make generate`
5. Commit the generated changes.
## License

RELEASE.md Normal file

@@ -0,0 +1,120 @@
# Release schedule
Kube-prometheus has a somewhat predictable release schedule: releases were
historically cut in sync with OpenShift releases as per downstream needs. So
far there hasn't been any problem with this schedule since it is also in sync
with Kubernetes releases. So for every new Kubernetes release, there is a new
release of kube-prometheus, although it tends to happen later.
# How to cut a new release
> This guide is strongly based on the [prometheus-operator release
> instructions](https://github.com/prometheus-operator/prometheus-operator/blob/master/RELEASE.md).
## Branch management and versioning strategy
We use [Semantic Versioning](http://semver.org/).
We maintain a separate branch for each minor release, named
`release-<major>.<minor>`, e.g. `release-1.1`, `release-2.0`.
The usual flow is to merge new features and changes into the master branch and
to merge bug fixes into the latest release branch. Bug fixes are then merged
into master from the latest release branch. The master branch should always
contain all commits from the latest release branch.
If a bug fix got accidentally merged into master, cherry-pick commits have to be
created in the latest release branch, which then has to be merged back into
master. Try to avoid that situation.
Maintaining the release branches for older minor releases happens on a best
effort basis.
## Cut a release of kubernetes-mixins
kube-prometheus and kubernetes-mixins releases are tied, so before cutting the
release of kube-prometheus we should make sure that the same release of
kubernetes-mixins exists.
## Update components version
Every release of kube-prometheus should include the latest versions of each
component. Updating them is automated via a CI job that can be triggered
manually from this
[workflow](https://github.com/prometheus-operator/kube-prometheus/actions/workflows/versions.yaml).
Once the workflow is completed, the prometheus-operator bot will create some
PRs. You should merge the one prefixed by `[bot][main]`, if created, before
proceeding. If the bot didn't create the PR, it is either because the workflow
failed or because the main branch was already up-to-date.
## Update Kubernetes supported versions
The main branch of kube-prometheus should support the last 2 versions of
Kubernetes. We need to make sure that the CI on the main branch is testing the
kube-prometheus configuration against both of these versions by updating the [CI
workflow](.github/workflows/ci.yaml) to include the latest kind version and the
2 latest image versions that are attached to the kind release. Once that is
done, the [compatibility matrix](README.md#kubernetes-compatibility-matrix) in
the README should also be updated to reflect the CI changes.
## Create pull request to cut the release
### Pin Jsonnet dependencies
Pin jsonnet dependencies in
[jsonnetfile.json](jsonnet/kube-prometheus/jsonnetfile.json). Each dependency
should be pinned to the latest release branch or if it doesn't have one, pinned
to the latest commit.
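For illustration, a pinned entry in `jsonnetfile.json` has roughly this shape (the remote, subdir, and `version` below are placeholders, not the actual list of dependencies to pin):

```jsonnet
{
  "version": 1,
  "dependencies": [
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus-operator/prometheus-operator.git",
          "subdir": "jsonnet/prometheus-operator"
        }
      },
      "version": "release-0.50"
    }
  ],
  "legacyImports": true
}
```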
### Start with a fresh environment
```bash
make clean
```
### Update Jsonnet dependencies
```bash
make update
```
### Generate manifests
```bash
make generate
```
### Update the compatibility matrix
Update the [compatibility matrix](README.md#kubernetes-compatibility-matrix) in
the README, by adding the new release based on the `main` branch compatibility
and removing the oldest release branch to only keep the latest 5 branches in the
matrix.
### Update changelog
Iterate over the PRs that were merged between the latest release of kube-prometheus and the HEAD and add the changelog entries to the [CHANGELOG](CHANGELOG.md).
## Create release branch
Once the PR cutting the release is merged, pull the changes, create a new
release branch named `release-x.y` based on the latest changes and push it to
the upstream repository.
## Create follow-up pull request
### Unpin Jsonnet dependencies
Revert previous changes made when pinning the jsonnet dependencies since we want
the main branch to be in sync with the latest changes of its dependencies.
### Update CI workflow
Update the [versions workflow](.github/workflows/versions.yaml) to include the latest release branch and remove the oldest one to reflect the list of supported releases.
### Update Kubernetes versions used by kubeconform
Update the versions of Kubernetes used when validating manifests with
kubeconform in the [Makefile](Makefile) to align with the compatibility
matrix.


@@ -33,8 +33,8 @@ This code of conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community.
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting a project maintainer listed in
https://github.com/prometheus-operator/prometheus-operator/blob/master/MAINTAINERS.md.
This Code of Conduct is adapted from the Contributor Covenant
(http://contributor-covenant.org), version 1.2.0, available at


@@ -0,0 +1,33 @@
# Ephemeral developer workspaces
Aiming to provide a better developer experience when contributing to kube-prometheus, whether by actively developing new features/bug fixes or by reviewing pull requests, we want to provide ephemeral developer workspaces with everything already configured (as far as tooling makes it possible).
Those developer workspaces should provide a brand new Kubernetes cluster where kube-prometheus can be easily deployed, so the contributor can easily see the impact of the changes a pull request proposes.
Today there are two providers on the market:
* [Github Codespaces](https://github.com/features/codespaces)
* [Gitpod](https://www.gitpod.io/)
## Codespaces
Unfortunately, Codespaces is not available to everyone. If you are fortunate enough to have access to it, you can open a new workspace from a specific branch, or even from a pull request.
![image](https://user-images.githubusercontent.com/24193764/135522435-44b177b4-00d4-4863-b45b-2db47c8c70d0.png)
![image](https://user-images.githubusercontent.com/24193764/135522560-c64968ab-3b4e-4639-893a-c4d0a14421aa.png)
After your workspace starts, you can deploy kube-prometheus inside a kind cluster by running `make deploy`.
If you are reviewing a PR, you'll have a fully-functional Kubernetes cluster generating real monitoring data that can be used to review whether the proposed changes work as described.
If you are working on new features/bug fixes, you can regenerate kube-prometheus's YAML manifests with `make generate` and deploy them again with `make deploy`.
## Gitpod
Gitpod is already available to everyone to use for free. It can also run commands that we specify in the `.gitpod.yml` file located in the root directory of the git repository, so even the cluster creation can be fully automated.
You can use the same workflow as mentioned in the [Codespaces](#codespaces) section, however Gitpod doesn't have native support for any kubernetes distribution. The workaround is to create a full QEMU Virtual Machine and deploy [k3s](https://github.com/k3s-io/k3s) inside this VM. Don't worry, this whole process is already fully automated, but due to the workaround the whole workspace may be very slow.
To open up a workspace with Gitpod, you can install the [Google Chrome extension](https://www.gitpod.io/docs/browser-extension/) to add a new button to the GitHub UI and use it on PRs or from the main page, or directly type `http://gitpod.io/#https://github.com/prometheus-operator/kube-prometheus/pull/<Pull Request Number>` (or just `http://gitpod.io/#https://github.com/prometheus-operator/kube-prometheus`) in the browser.
![image](https://user-images.githubusercontent.com/24193764/135534546-4f6bf0e5-57cd-4e35-ad80-88bd47d64276.png)


@@ -0,0 +1,20 @@
#!/bin/bash
which kind
if [[ $? != 0 ]]; then
echo 'kind not available in $PATH, installing latest kind'
# Install latest kind
curl -s https://api.github.com/repos/kubernetes-sigs/kind/releases/latest \
| grep "browser_download_url.*kind-linux-amd64" \
| cut -d : -f 2,3 \
| tr -d \" \
| wget -qi -
mv kind-linux-amd64 kind && chmod +x kind
fi
cluster_created=$($PWD/kind get clusters 2>&1)
if [[ "$cluster_created" == "No kind clusters found." ]]; then
$PWD/kind create cluster
else
echo "Cluster '$cluster_created' already present"
fi


@@ -1,9 +1,13 @@
#!/bin/bash
kubectl apply -f manifests/setup
# Safety wait for CRDs to be working
sleep 30
kubectl apply -f manifests/
sleep 30
# Safety wait for resources to be created
kubectl rollout status -n monitoring daemonset node-exporter
kubectl rollout status -n monitoring statefulset alertmanager-main


@@ -4,9 +4,9 @@ AWS EKS uses [CNI](https://github.com/aws/amazon-vpc-cni-k8s) networking plugin
One fatal issue that can occur is that you run out of IP addresses in your EKS cluster. (This generally happens due to misconfigurations where pods keep getting scheduled.)
You can monitor the `awscni` using kube-prometheus with:
```jsonnet mdox-exec="cat examples/eks-cni-example.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+: {


@@ -5,6 +5,7 @@ authentication. Until it does, Prometheus must use HTTP (not HTTPS)
for scraping.
You can configure this behavior through kube-prometheus with:
```
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet') +


@@ -1,16 +1,16 @@
---
title: Blackbox Exporter
menu:
  docs:
    parent: kube
lead: This Document documents the types introduced by the Prometheus Operator to be consumed by users.
lastmod: "2021-03-08T08:49:31+00:00"
images: []
draft: false
description: Generated API docs for the Prometheus Operator
date: "2021-03-08T08:49:31+00:00"
---
# Setting up a blackbox exporter
@@ -21,6 +21,7 @@ The `prometheus-operator` defines a `Probe` resource type that can be used to de
1. Override blackbox-related configuration parameters as needed (a sketch follows the snippet below).
2. Add the following to the list of renderers to render the blackbox exporter manifests:
```
{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) }
```
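As a sketch of step 1 (assuming the `values.blackboxExporter.modules` field exposed by current kube-prometheus; the `http_post_2xx` module here is a made-up example merged into the defaults), an override could look like:

```jsonnet
local kp = (import 'kube-prometheus/main.libsonnet') + {
  values+:: {
    blackboxExporter+: {
      modules+: {
        // Hypothetical extra probe module, merged into the default set.
        http_post_2xx: {
          prober: 'http',
          http: { method: 'POST' },
        },
      },
    },
  },
};

{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) }
```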


@@ -4,7 +4,7 @@ For bugs, you can use the GitHub [issue tracker](https://github.com/prometheus-o
For questions, you can use the GitHub [discussions forum](https://github.com/prometheus-operator/kube-prometheus/discussions).
Many of the `kube-prometheus` project's contributors and users can also be found on the #prometheus-operator channel of the [Kubernetes Slack](https://slack.k8s.io/).
`kube-prometheus` is the aggregation of many projects that all have different
channels to reach out for help and support. This community strives at
@@ -18,7 +18,7 @@ if applicable.
For documentation, check the project's [documentation directory](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation).
For questions, use the #prometheus-operator channel on the [Kubernetes Slack](https://slack.k8s.io/).
For bugs, use the GitHub [issue tracker](https://github.com/prometheus-operator/prometheus-operator/issues/new/choose).
@@ -26,19 +26,19 @@ For bugs, use the GitHub [issue tracker](https://github.com/prometheus-operator/
For documentation, check the Prometheus [online docs](https://prometheus.io/docs/). There is a
[section](https://prometheus.io/docs/introduction/media/) with links to blog
posts, recorded talks and presentations. This [repository](https://github.com/roaldnefs/awesome-prometheus)
(not affiliated to the Prometheus project) has also a list of curated resources
related to the Prometheus ecosystem.
For questions, see the Prometheus [community page](https://prometheus.io/community/) for the various channels.
There is also a #prometheus channel on the [CNCF Slack](https://slack.cncf.io/).
## kube-state-metrics
For documentation, see the project's [docs directory](https://github.com/kubernetes/kube-state-metrics/tree/master/docs).
For questions, use the #kube-state-metrics channel on the [Kubernetes Slack](https://slack.k8s.io/).
For bugs, use the GitHub [issue tracker](https://github.com/kubernetes/kube-state-metrics/issues/new/choose).
@@ -46,7 +46,7 @@ For bugs, use the GitHub [issue tracker](https://github.com/kubernetes/kube-stat
For documentation, check the [Kubernetes docs](https://kubernetes.io/docs/home/).
For questions, use the [community forums](https://discuss.kubernetes.io/) and the [Kubernetes Slack](https://slack.k8s.io/). Check also the [community page](https://kubernetes.io/community/#discuss).
For bugs, use the GitHub [issue tracker](https://github.com/kubernetes/kubernetes/issues/new/choose).
@@ -54,7 +54,7 @@ For bugs, use the GitHub [issue tracker](https://github.com/kubernetes/kubernete
For documentation, check the project's [README](https://github.com/DirectXMan12/k8s-prometheus-adapter/blob/master/README.md).
For questions, use the #sig-instrumentation channel on the [Kubernetes Slack](https://slack.k8s.io/).
For bugs, use the GitHub [issue tracker](https://github.com/DirectXMan12/k8s-prometheus-adapter/issues/new).
@@ -70,7 +70,7 @@ For bugs, use the GitHub [issue tracker](https://github.com/grafana/grafana/issu
For documentation, check the project's [README](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/README.md).
For questions, use #monitoring-mixins channel on the [Kubernetes Slack](https://slack.k8s.io/).
For bugs, use the GitHub [issue tracker](https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/new).
@@ -79,6 +79,3 @@ For bugs, use the GitHub [issue tracker](https://github.com/kubernetes-monitorin
For documentation, check the [Jsonnet](https://jsonnet.org/) website.
For questions, use the [mailing list](https://groups.google.com/forum/#!forum/jsonnet).


@@ -1,15 +1,15 @@
---
title: Deploy to kind
menu:
  docs:
    parent: kube
lead: Deploy kube-prometheus to Kubernetes kind.
images: []
draft: false
description: Deploy kube-prometheus to Kubernetes kind.
date: "2021-03-08T23:04:32+01:00"
---


@@ -1,15 +1,15 @@
---
title: Prometheus Rules and Grafana Dashboards
menu:
  docs:
    parent: kube
lead: Create Prometheus Rules and Grafana Dashboards on top of kube-prometheus
images: []
draft: false
description: Create Prometheus Rules and Grafana Dashboards on top of kube-prometheus
date: "2021-03-08T23:04:32+01:00"
---
`kube-prometheus` ships with a set of default [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) and [Grafana](http://grafana.com/) dashboards. At some point one might like to extend them; the purpose of this document is to explain how to do this.
@@ -20,8 +20,7 @@ For both the Prometheus rules and the Grafana dashboards Kubernetes `ConfigMap`s
As a basis, all examples in this guide are based on the base example of the kube-prometheus [readme](../README.md):
```jsonnet mdox-exec="cat example.jsonnet"
local kp =
(import 'kube-prometheus/main.libsonnet') +
// Uncomment the following imports to enable its patches
@@ -68,8 +67,7 @@ The format is exactly the Prometheus format, so there should be no changes neces
> Note that alerts can just as well be included into this file, using the jsonnet `import` function. In this example it is just inlined in order to demonstrate their use in a single file.
```jsonnet mdox-exec="cat examples/prometheus-additional-alert-rule-example.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+: {
@@ -124,8 +122,7 @@ In order to add a recording rule, simply do the same with the `prometheusRules`
> Note that rules can just as well be included into this file, using the jsonnet `import` function. In this example it is just inlined in order to demonstrate their use in a single file.
```jsonnet mdox-exec="cat examples/prometheus-additional-recording-rule-example.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+: {
@@ -184,8 +181,7 @@ cat existingrule.yaml | gojsontoyaml -yamltojson > existingrule.json
Then import it in jsonnet:
```jsonnet mdox-exec="cat examples/prometheus-additional-rendered-rule-example.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+: {
@@ -217,6 +213,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +
{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) }
```
### Changing default rules
Along with adding additional rules, we give the user the option to filter or adjust the existing rules imported by `kube-prometheus/main.libsonnet`. The recording rules can be found in [kube-prometheus/components/mixin/rules](../jsonnet/kube-prometheus/components/mixin/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules) while the alerting rules can be found in [kube-prometheus/components/mixin/alerts](../jsonnet/kube-prometheus/components/mixin/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts).
@@ -224,7 +221,9 @@ Along with adding additional rules, we give the user the option to filter or adj
Knowing which rules to change, the user can now use functions from the [Jsonnet standard library](https://jsonnet.org/ref/stdlib.html) to make these changes. Below are examples of both a filter and an adjustment being made to the default rules. These changes can be assigned to a local variable and then added to the `local kp` object as seen in the examples above.
#### Filter
Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). You first need to find out in which component the rule is defined (here it is kubernetesControlPlane).
```jsonnet
local filter = {
kubernetesControlPlane+: {
@@ -251,7 +250,9 @@ local filter = {
```
#### Adjustment
Here the expression for another alert in the same component is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet).
```jsonnet
local update = {
kubernetesControlPlane+: {
@@ -283,6 +284,7 @@ local update = {
```
Using the example from above about adding in pre-rendered rules, the new local variables can be added in as follows:
```jsonnet
local add = {
exampleApplication:: {
@@ -327,6 +329,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') +
{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +
{ ['exampleApplication-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) }
```
## Dashboards
Dashboards can either be added using jsonnet or simply a pre-rendered json dashboard.
@@ -337,8 +340,7 @@ We recommend using the [grafonnet](https://github.com/grafana/grafonnet-lib/) li
> Note that dashboards can just as well be included into this file, using the jsonnet `import` function. In this example it is just inlined in order to demonstrate their use in a single file.
```jsonnet mdox-exec="cat examples/grafana-additional-jsonnet-dashboard-example.jsonnet"
local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
@@ -394,8 +396,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
As jsonnet is a superset of json, the jsonnet `import` function can be used to include Grafana dashboard json blobs. In this example we are importing a [provided example dashboard](../examples/example-grafana-dashboard.json).
```jsonnet mdox-exec="cat examples/grafana-additional-rendered-dashboard-example.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+:: {
@@ -419,8 +420,8 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
```
In case you have lots of JSON dashboards exported from the Grafana UI, the above approach is going to take a lot of time. To improve performance, we can use the `rawDashboards` field and provide its value as a JSON string by using `importstr`:
```jsonnet mdox-exec="cat examples/grafana-additional-rendered-dashboard-example-2.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') + {
values+:: {
common+:: {
@@ -523,8 +524,7 @@ values+:: {
Full example of including etcd mixin using method described above:
```jsonnet mdox-exec="cat examples/mixin-inclusion.jsonnet"
local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet');
local etcdMixin = addMixin({
name: 'etcd',


@@ -1,15 +1,15 @@
---
title: Expose via Ingress
menu:
  docs:
    parent: kube
lead: How to set up a Kubernetes Ingress to expose Prometheus, Alertmanager and Grafana.
images: []
draft: false
description: How to set up a Kubernetes Ingress to expose Prometheus, Alertmanager and Grafana.
date: "2021-03-08T23:04:32+01:00"
---
In order to access the web interfaces via the Internet, [Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) is a popular option. This guide explains how Kubernetes Ingress can be set up in order to expose the Prometheus, Alertmanager and Grafana UIs that are included in the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project.
@@ -104,7 +104,7 @@ k.core.v1.list.new([
In order to expose Alertmanager and Grafana, simply create additional fields containing an ingress object, pointing at the `alertmanager` or `grafana` Service instead of `prometheus-k8s`. Make sure to also use the correct port respectively: for Alertmanager it is also `web`, for Grafana it is `http`. Be sure to also specify the appropriate external URL. Note that the external URL for Grafana is set in a different way than the external URL for Prometheus or Alertmanager. See [ingress.jsonnet](../examples/ingress.jsonnet) for how to set the Grafana external URL.
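As an illustration only (a hand-written sketch, not taken from [ingress.jsonnet](../examples/ingress.jsonnet); the host name is a placeholder), an Ingress for Alertmanager could look like:

```jsonnet
// Hand-written sketch: a plain Ingress manifest pointing at the
// alertmanager-main Service on its 'web' port.
{
  apiVersion: 'networking.k8s.io/v1',
  kind: 'Ingress',
  metadata: { name: 'alertmanager-main', namespace: 'monitoring' },
  spec: {
    rules: [{
      host: 'alertmanager.example.com',  // placeholder host
      http: {
        paths: [{
          path: '/',
          pathType: 'Prefix',
          backend: { service: { name: 'alertmanager-main', port: { name: 'web' } } },
        }],
      },
    }],
  },
}
```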
In order to render the ingress objects similarly to the other objects, use the approach demonstrated in the [main readme](../README.md):
```
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +


@@ -1,15 +1,15 @@
---
title: Deploy to kubeadm
menu:
  docs:
    parent: kube
lead: Deploy kube-prometheus to Kubernetes kubeadm.
images: []
draft: false
description: Deploy kube-prometheus to Kubernetes kubeadm.
date: "2021-03-08T23:04:32+01:00"
---
The [kubeadm](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/) tool is linked by Kubernetes as the official way to deploy and manage self-hosted clusters. kubeadm does a lot of heavy lifting by automatically configuring your Kubernetes cluster with some common options. This guide is intended to show you how to deploy Prometheus, Prometheus Operator and Kube Prometheus to get you started monitoring your cluster that was deployed with kubeadm.
@@ -93,7 +93,6 @@ Once you complete this guide you will monitor the following:
* kube-scheduler
* kube-controller-manager
## Getting Up and Running Fast with Kube-Prometheus
To help get started more quickly with monitoring Kubernetes clusters, [kube-prometheus](https://github.com/coreos/kube-prometheus) was created. It is a collection of manifests including dashboards and alerting rules that can easily be deployed. It utilizes the Prometheus Operator and all the manifests demonstrated in this guide.


@@ -2,9 +2,9 @@
An example conversion of a legacy custom jsonnet file to release-0.8
format can be seen by viewing and comparing this
[release-0.3 jsonnet file](my.release-0.3.jsonnet) (when the github
repo was under `https://github.com/coreos/kube-prometheus...`)
and the corresponding [release-0.8 jsonnet file](my.release-0.8.jsonnet).
These two files have had necessary blank lines added so that they
can be compared side-by-side and line-by-line on screen.
@@ -16,8 +16,9 @@ release-0.3 and also the major migration after release-0.7 as described in
The sample files are intended as an example of format conversion and
not necessarily best practice for the files in release-0.3 or release-0.8.
Below are three sample extracts of the conversion as an indication of the
changes required.
<table>
<tr>
<th> release-0.3 </th>


@@ -33,14 +33,14 @@ Thanks to our community we identified a lot of short-commings of previous design
Those concepts were already present in the repository but it wasn't clear which file is holding what. After refactoring we categorized jsonnet code into 3 buckets and put them into separate directories:
- `components` - main building blocks for kube-prometheus, written as functions responsible for creating multiple objects representing kubernetes manifests. For example all objects for node_exporter deployment are bundled in `components/node_exporter.libsonnet` library
- `addons` - everything that can enhance kube-prometheus deployment. Those are small snippets of code adding a small feature, for example adding anti-affinity to pods via [`addons/anti-affinity.libsonnet`](https://github.com/prometheus-operator/kube-prometheus/blob/main/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet). Addons are meant to be used in object-oriented way like `local kp = (import 'kube-prometheus/main.libsonnet') + (import 'kube-prometheus/addons/all-namespaces.libsonnet')`
- `platforms` - currently those are `addons` specialized to allow deploying kube-prometheus project on a specific platform.
### Component configuration
Refactoring main components to use functions allowed us to define APIs for said components. Each function has a default set of parameters that can be overridden or that are required to be set by a user. Those default parameters are represented in each component by `defaults` map at the top of each library file, for example in [`node_exporter.libsonnet`](https://github.com/prometheus-operator/kube-prometheus/blob/1d2a0e275af97948667777739a18b24464480dc8/jsonnet/kube-prometheus/components/node-exporter.libsonnet#L3-L34).
This API is meant to ease the use of kube-prometheus as parameters can be passed from a JSON file and don't need to be in jsonnet format. However, if you need to modify particular parts of the stack, jsonnet allows you to do this and we are also not restricting such access in any way. An example of such modifications can be seen in any of our `addons`, like the [`addons/anti-affinity.libsonnet`](https://github.com/prometheus-operator/kube-prometheus/blob/main/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet) one.
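For intuition, the pattern looks roughly like this (a made-up minimal component, not one from the repository):

```jsonnet
// Hypothetical component library following the defaults-map pattern.
local defaults = {
  namespace: error 'must provide namespace',  // required parameter
  port: 9100,                                 // overridable default
};

function(params) {
  local config = defaults + params,

  service: {
    apiVersion: 'v1',
    kind: 'Service',
    metadata: { name: 'example-exporter', namespace: config.namespace },
    spec: { ports: [{ name: 'metrics', port: config.port }] },
  },
}
```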
### Mixin integration
@@ -63,25 +63,14 @@ All examples from `examples/` directory were adapted to the new codebase. [Pleas
## Legacy migration
An example of conversion of a legacy release-0.3 my.jsonnet file to release-0.8 can be found in [migration-example](migration-example)
## Advanced usage examples
For more advanced usage examples you can take a look at those two, open to public, implementations:
- [thaum-xyz/ankhmorpork](https://github.com/thaum-xyz/ankhmorpork/blob/master/apps/monitoring/jsonnet) - extending kube-prometheus to adapt to a required environment
- [openshift/cluster-monitoring-operator](https://github.com/openshift/cluster-monitoring-operator/pull/1044) - using kube-prometheus components as standalone libraries to build a custom solution
## Final note
Refactoring was a huge undertaking and possibly this document didn't describe in enough detail how to help you with migration to the new stack. If that is the case, please reach out to us by using [GitHub discussions][discussions] feature or directly on [#prometheus-operator kubernetes slack channel][slack].
[antiaffinity]: https://github.com/prometheus-operator/kube-prometheus/blob/main/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet
[node_exporter_defaults_example]: https://github.com/prometheus-operator/kube-prometheus/blob/1d2a0e275af97948667777739a18b24464480dc8/jsonnet/kube-prometheus/components/node-exporter.libsonnet#L3-L34
[openshift]: https://github.com/openshift/cluster-monitoring-operator/pull/1044
[thaum]: https://github.com/thaum-xyz/ankhmorpork/blob/master/apps/monitoring/jsonnet
[discussions]: https://github.com/prometheus-operator/kube-prometheus/discussions
[slack]: http://slack.k8s.io/
Refactoring was a huge undertaking, and this document possibly doesn't describe migration to the new stack in enough detail. If that is the case, please reach out to us by using the [GitHub discussions](https://github.com/prometheus-operator/kube-prometheus/discussions) feature or directly on the [#prometheus-operator kubernetes slack channel](http://slack.k8s.io/).

View File

@@ -1,23 +1,23 @@
---
title: "Monitoring external etcd"
description: "This guide will help you monitor an external etcd cluster."
lead: "This guide will help you monitor an external etcd cluster."
date: 2021-03-08T23:04:32+01:00
draft: false
images: []
menu:
docs:
parent: "kube"
weight: 640
toc: true
title: Monitoring external etcd
menu:
docs:
parent: kube
lead: This guide will help you monitor an external etcd cluster.
images: []
draft: false
description: This guide will help you monitor an external etcd cluster.
date: "2021-03-08T23:04:32+01:00"
---
This guide covers the case when the etcd cluster is not hosted inside Kubernetes, which is common in many Kubernetes setups. This approach has been tested with kube-aws, but the same principles apply to other tools.
Note that [etcd.jsonnet](../examples/etcd.jsonnet) & [kube-prometheus-static-etcd.libsonnet](../jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet) (which are described in a section of the [Readme](../README.md#static-etcd-configuration)) do the following:
Note that [etcd.jsonnet](../examples/etcd.jsonnet) & [static-etcd.libsonnet](../jsonnet/kube-prometheus/addons/static-etcd.libsonnet) (which are described in a section of the [Readme](../README.md#static-etcd-configuration)) do the following:
* Put the three etcd TLS client files (CA & cert & key) into a secret in the namespace, and have Prometheus Operator load the secret.
* Create the following (to expose etcd metrics - port 2379): a Service, Endpoint, & ServiceMonitor.
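For reference, a minimal sketch of wiring this up from a user's jsonnet entrypoint, assuming the field names used by `examples/etcd.jsonnet` (the IPs, certificate file names, and server name below are placeholders):

```jsonnet
local kp = (import 'kube-prometheus/main.libsonnet') +
           (import 'kube-prometheus/addons/static-etcd.libsonnet') + {
  values+:: {
    common+: { namespace: 'monitoring' },
    etcd+: {
      // Placeholder addresses of the external etcd members (metrics port 2379).
      ips: ['10.0.0.10', '10.0.0.11', '10.0.0.12'],
      // The three TLS client files that end up in the secret loaded by
      // Prometheus Operator, as described above.
      clientCA: importstr 'etcd-client-ca.crt',
      clientKey: importstr 'etcd-client.key',
      clientCert: importstr 'etcd-client.crt',
      serverName: 'etcd.example.internal',  // must match a SAN in the server certificate
    },
  },
};

{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) }
```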
# Step 1: Open the port
@@ -26,6 +26,7 @@ You now need to allow the nodes Prometheus are running on to talk to the etcd on
If using kube-aws, you will need to edit the etcd security group inbound, specifying the security group of your Kubernetes node (worker) as the source.
## kube-aws and EIP or ENI inconsistency
With kube-aws, each etcd node has two IP addresses:
* EC2 instance IP
@@ -40,6 +41,7 @@ Another idea would be to use the DNS entries of etcd, but those are not currently
# Step 2: verify
Go to the Prometheus UI on :9090/config and check that you have an etcd job entry:
```
- job_name: monitoring/etcd-k8s/0
scrape_interval: 30s
@@ -48,6 +50,5 @@ Go to the Prometheus UI on :9090/config and check that you have an etcd job entr
```
On the :9090/targets page:
* You should see "etcd" with the UP state. If not, check the Error column for more information.
* If no "etcd" targets are even shown on this page, Prometheus isn't attempting to scrape it.

View File

@@ -1,24 +1,26 @@
---
title: "Monitoring other Namespaces"
description: "This guide will help you monitor applications in other Namespaces."
lead: "This guide will help you monitor applications in other Namespaces."
date: 2021-03-08T23:04:32+01:00
draft: false
images: []
menu:
docs:
parent: "kube"
weight: 640
toc: true
title: Monitoring other Namespaces
menu:
docs:
parent: kube
lead: This guide will help you monitor applications in other Namespaces.
images: []
draft: false
description: This guide will help you monitor applications in other Namespaces.
date: "2021-03-08T23:04:32+01:00"
---
This guide will help you monitor applications in other Namespaces. By default, the RBAC rules are only enabled for the `default` and `kube-system` Namespaces during installation.
# Setup
You have to provide the list of the Namespaces that you want to be able to monitor.
This is done in the variable `prometheus.roleSpecificNamespaces`. You usually set this in your `.jsonnet` file when building the manifests.
Example to create the needed `Role` and `RoleBinding` for the Namespace `foo`:
```
local kp = (import 'kube-prometheus/main.libsonnet') + {
_config+:: {
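    // NOTE: the diff is truncated at this point; the lines below are an
    // editor's sketch completing the example with the variable described
    // above (`prometheus.roleSpecificNamespaces`).
    prometheus+:: {
      roleSpecificNamespaces: ['foo'],  // Namespaces to create a Role/RoleBinding for
    },
  },
};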

View File

@@ -1,9 +1,11 @@
# Setup Weave Net monitoring using kube-prometheus
[Weave Net](https://kubernetes.io/docs/concepts/cluster-administration/networking/#weave-net-from-weaveworks) is a resilient and simple-to-use CNI provider for Kubernetes. A well-monitored and observed CNI provider helps in troubleshooting Kubernetes networking problems. [Weave Net](https://www.weave.works/docs/net/latest/concepts/how-it-works/) emits [Prometheus metrics](https://www.weave.works/docs/net/latest/tasks/manage/metrics/) for monitoring Weave Net. There are many ways to install Weave Net in your cluster. One of them is using [kops](https://github.com/kubernetes/kops/blob/master/docs/networking.md).
Following this document, you can set up Weave Net monitoring for your cluster using kube-prometheus.
## Contents
Using kube-prometheus and kubectl you will be able to install the following for monitoring Weave Net in your cluster:
1. [Service for Weave Net](https://gist.github.com/alok87/379c6234b582f555c141f6fddea9fbce): the Service which the [service monitor](https://coreos.com/operators/prometheus/docs/latest/user-guides/cluster-monitoring.html) scrapes.
@@ -15,8 +17,7 @@ Using kube-prometheus and kubectl you will be able install the following for mon
## Instructions
- You can monitor Weave Net using an example like the one below. **Please note that some alert configurations are environment specific and may require modifications of alert thresholds**. For example: the FastDP flows have never gone below 15000 for us, but if this value is, say, 20000 for you, then you can use an example like the one below to update the alert. The alerts which may require threshold modifications are `WeaveNetFastDPFlowsLow` and `WeaveNetIPAMUnreachable`.
[embedmd]:# (../examples/weave-net-example.jsonnet)
```jsonnet
```jsonnet mdox-exec="cat examples/weave-net-example.jsonnet"
local kp = (import 'kube-prometheus/main.libsonnet') +
(import 'kube-prometheus/addons/weave-net/weave-net.libsonnet') + {
values+:: {
@@ -66,6 +67,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') +
```
- After you have the required yamls file please run
```
kubectl create -f prometheus-serviceWeaveNet.yaml
kubectl create -f prometheus-serviceMonitorWeaveNet.yaml

View File

@@ -1,11 +1,10 @@
# Windows
The [Windows addon](../examples/windows.jsonnet) adds the dashboards and rules from [kubernetes-monitoring/kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin#dashboards-for-windows-nodes).
Currently, Windows does not support running [windows_exporter](https://github.com/prometheus-community/windows_exporter) in a pod, so this add-on uses [additional scrape configuration](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/additional-scrape-config.md) to set up a static config to scrape the node ports where windows_exporter is configured.
The add-on requires you to specify the node IPs and ports where it can find the windows_exporter. See the [full example](../examples/windows.jsonnet) for setup.
```
local kp = (import 'kube-prometheus/main.libsonnet') +
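  // NOTE: the diff is truncated at this point; the lines below are an
  // editor's sketch based on the full example referenced above. The addon
  // import path and the `windowsScrapeConfig` field are assumptions; the
  // target IP:port pairs are placeholders for your windows_exporter endpoints.
  (import 'kube-prometheus/addons/windows.libsonnet') + {
    values+:: {
      windowsScrapeConfig+:: {
        static_configs: [{
          targets: ['10.240.0.65:5000', '10.240.0.63:5000'],
        }],
      },
    },
  };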

View File

@@ -24,7 +24,7 @@ local kp = (import 'kube-prometheus/main.libsonnet') + {
],
selector: {
matchLabels: {
app: 'myapp',
'app.kubernetes.io/name': 'myapp',
},
},
},

View File

@@ -19,4 +19,4 @@ spec:
- logging
selector:
matchLabels:
app: myapp
app.kubernetes.io/name: myapp

View File

@@ -7,7 +7,7 @@ metadata:
namespace: default
spec:
selector:
app: example-app
app.kubernetes.io/name: example-app
ports:
- name: web
protocol: TCP
@@ -22,17 +22,17 @@ metadata:
spec:
selector:
matchLabels:
app: example-app
app.kubernetes.io/name: example-app
version: 1.1.3
replicas: 4
template:
metadata:
labels:
app: example-app
app.kubernetes.io/name: example-app
version: 1.1.3
spec:
containers:
- name: example-app
image: quay.io/fabxc/prometheus_demo_service
ports:
- name: web

View File

@@ -0,0 +1,36 @@
local kp =
(import 'kube-prometheus/main.libsonnet') +
{
values+:: {
common+: {
namespace: 'monitoring',
},
grafana+: {
config+: {
sections: {
'auth.ldap': {
enabled: true,
config_file: '/etc/grafana/ldap.toml',
allow_sign_up: true,
},
},
},
ldap: |||
[[servers]]
host = "127.0.0.1"
port = 389
use_ssl = false
start_tls = false
ssl_skip_verify = false
bind_dn = "cn=admins,dc=example,dc=com"
bind_password = 'grafana'
search_filter = "(cn=%s)"
search_base_dns = ["dc=example,dc=com"]
|||,
},
},
};
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }

View File

@@ -31,6 +31,10 @@ local withImageRepository(repository) = {
},
};
{
imageName:: imageName,
}
{
withImageRepository:: withImageRepository,
}
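For context, the objects exported by this hunk can be consumed from a user entrypoint roughly like this (a sketch: the import path and registry name are assumptions, modeled on how other kube-prometheus mixins are used):

```jsonnet
// a usage sketch; the import path is an assumption (adjust it to this
// file's actual location under jsonnet/kube-prometheus/).
local lib = import 'kube-prometheus/addons/config-mixins.libsonnet';

// Pull every image from an internal registry mirror instead of the defaults.
local kp = (import 'kube-prometheus/main.libsonnet') +
           lib.withImageRepository('internal-registry.example.com/organization');

{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) }
```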

View File

@@ -81,7 +81,7 @@
},
deployment:
local podLabels = { app: 'ksm-autoscaler' };
local podLabels = { 'app.kubernetes.io/name': 'ksm-autoscaler' };
local c = {
name: 'ksm-autoscaler',
image: $.values.clusterVerticalAutoscaler.image,

View File

@@ -19,6 +19,7 @@ local defaults = {
if !std.setMember(labelName, ['app.kubernetes.io/version'])
},
name: error 'must provide name',
reloaderPort: 8080,
config: {
global: {
resolve_timeout: '5m',
@@ -136,9 +137,9 @@ function(params) {
spec: {
ports: [
{ name: 'web', targetPort: 'web', port: 9093 },
{ name: 'reloader-web', port: am._config.reloaderPort, targetPort: 'reloader-web' },
],
selector: {
app: 'alertmanager',
alertmanager: am._config.name,
} + am._config.selectorLabels,
sessionAffinity: 'ClientIP',
@@ -161,12 +162,13 @@ function(params) {
},
endpoints: [
{ port: 'web', interval: '30s' },
{ port: 'reloader-web', interval: '30s' },
],
},
},
[if (defaults + params).replicas > 1 then 'podDisruptionBudget']: {
apiVersion: 'policy/v1beta1',
apiVersion: 'policy/v1',
kind: 'PodDisruptionBudget',
metadata: {
name: 'alertmanager-' + am._config.name,

View File

@@ -1,3 +1,5 @@
local kubernetesGrafana = import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet';
local defaults = {
local defaults = self,
name: 'grafana',
@@ -20,86 +22,35 @@ local defaults = {
if !std.setMember(labelName, ['app.kubernetes.io/version'])
},
prometheusName: error 'must provide prometheus name',
dashboards: {},
// TODO(paulfantom): expose those to have a stable API. After kubernetes-grafana refactor those could probably be removed.
rawDashboards: {},
folderDashboards: {},
containers: [],
datasources: [],
config: {},
plugins: [],
env: [],
};
function(params) {
local g = self,
_config:: defaults + params,
function(params)
local config = defaults + params;
// Safety check
assert std.isObject(g._config.resources),
assert std.isObject(config.resources);
local glib = (import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + {
_config+:: {
namespace: g._config.namespace,
versions+:: {
grafana: g._config.version,
},
imageRepos+:: {
grafana: std.split(g._config.image, ':')[0],
},
prometheus+:: {
name: g._config.prometheusName,
},
grafana+:: {
kubernetesGrafana(config) {
local g = self,
_config+:: config,
serviceMonitor: {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'grafana',
namespace: g._config.namespace,
labels: g._config.commonLabels,
dashboards: g._config.dashboards,
resources: g._config.resources,
rawDashboards: g._config.rawDashboards,
folderDashboards: g._config.folderDashboards,
containers: g._config.containers,
config+: g._config.config,
plugins+: g._config.plugins,
env: g._config.env,
} + (
// Conditionally overwrite default setting.
if std.length(g._config.datasources) > 0 then
{ datasources: g._config.datasources }
else {}
),
},
},
config: glib.grafana.config,
service: glib.grafana.service,
serviceAccount: glib.grafana.serviceAccount,
deployment: glib.grafana.deployment,
dashboardDatasources: glib.grafana.dashboardDatasources,
dashboardSources: glib.grafana.dashboardSources,
dashboardDefinitions: if std.length(g._config.dashboards) > 0 ||
std.length(g._config.rawDashboards) > 0 ||
std.length(g._config.folderDashboards) > 0 then {
apiVersion: 'v1',
kind: 'ConfigMapList',
items: glib.grafana.dashboardDefinitions,
},
serviceMonitor: {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'grafana',
namespace: g._config.namespace,
labels: g._config.commonLabels,
},
spec: {
selector: {
matchLabels: {
'app.kubernetes.io/name': 'grafana',
},
},
endpoints: [{
port: 'http',
interval: '15s',
}],
spec: {
selector: {
matchLabels: {
'app.kubernetes.io/name': 'grafana',
},
},
endpoints: [{
port: 'http',
interval: '15s',
}],
},
},
},
}
}

View File

@@ -127,9 +127,7 @@ function(params) {
action: 'drop',
regex: '(' + std.join('|',
[
'container_fs_.*', // add filesystem read/write data (nodes*disks*services*4)
'container_spec_.*', // everything related to cgroup specification and thus static data (nodes*services*5)
'container_blkio_device_usage_total', // useful for containers, but not for system services (nodes*disks*services*operations*2)
'container_file_descriptors', // file descriptors limits and global numbers are exposed via (nodes*services)
'container_sockets', // used sockets in cgroup. Usually not important for system services (nodes*services)
'container_threads_max', // max number of threads in cgroup. Usually for system services it is not limited (nodes*services)
@@ -138,6 +136,14 @@ function(params) {
'container_last_seen', // not needed as system services are always running (nodes*services)
]) + ');;',
},
{
sourceLabels: ['__name__', 'container'],
action: 'drop',
regex: '(' + std.join('|',
[
'container_blkio_device_usage_total',
]) + ');.+',
},
],
},
{

View File

@@ -12,6 +12,7 @@ local defaults = {
limits: { cpu: '250m', memory: '180Mi' },
},
listenAddress: '127.0.0.1',
filesystemMountPointsExclude: '^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)',
port: 9100,
commonLabels:: {
'app.kubernetes.io/name': defaults.name,
@@ -180,7 +181,7 @@ function(params) {
'--path.rootfs=/host/root',
'--no-collector.wifi',
'--no-collector.hwmon',
'--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)',
'--collector.filesystem.mount-points-exclude=' + ne._config.filesystemMountPointsExclude,
// NOTE: ignore veth network interface associated with containers.
// OVN renames veth.* to <rand-hex>@if<X> where X is /sys/class/net/<if>/ifindex
// thus [a-z0-9] regex below
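Since the mount-point exclude list is now a `filesystemMountPointsExclude` parameter instead of a hard-coded flag, users can override it from their entrypoint; a minimal sketch (the extra `snap` path is illustrative):

```jsonnet
local kp = (import 'kube-prometheus/main.libsonnet') + {
  values+:: {
    nodeExporter+: {
      // Extend the default exclude list shown above with one more path.
      filesystemMountPointsExclude: '^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|snap/.+)($|/)',
    },
  },
};

{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) }
```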

View File

@@ -172,6 +172,21 @@ function(params) {
insecureSkipVerify: true,
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: [
{
sourceLabels: ['__name__'],
action: 'drop',
regex: '(' + std.join('|',
[
'apiserver_client_certificate_.*', // The only client supposed to connect to the aggregated API is the apiserver so it is not really meaningful to monitor its certificate.
'apiserver_envelope_.*', // Prometheus-adapter isn't using envelope for storage.
'apiserver_flowcontrol_.*', // Prometheus-adapter isn't using flowcontrol.
'apiserver_storage_.*', // Prometheus-adapter isn't using the apiserver storage.
'apiserver_webhooks_.*', // Prometheus-adapter doesn't make use of apiserver webhooks.
'workqueue_.*', // Metrics related to the internal apiserver auth workqueues are not very useful to prometheus-adapter.
]) + ')',
},
],
},
],
},
@@ -363,7 +378,7 @@ function(params) {
},
[if (defaults + params).replicas > 1 then 'podDisruptionBudget']: {
apiVersion: 'policy/v1beta1',
apiVersion: 'policy/v1',
kind: 'PodDisruptionBudget',
metadata: {
name: pa._config.name,

View File

@@ -35,6 +35,7 @@ local defaults = {
},
},
thanos: null,
reloaderPort: 8080,
};
@@ -58,6 +59,7 @@ function(params) {
targetGroups: {},
sidecar: {
selector: p._config.mixin._config.thanosSelector,
thanosPrometheusCommonDimensions: 'namespace, pod',
dimensions: std.join(', ', ['job', 'instance']),
},
},
@@ -98,13 +100,14 @@ function(params) {
spec: {
ports: [
{ name: 'web', targetPort: 'web', port: 9090 },
{ name: 'reloader-web', port: p._config.reloaderPort, targetPort: 'reloader-web' },
] +
(
if p._config.thanos != null then
[{ name: 'grpc', port: 10901, targetPort: 10901 }]
else []
),
selector: { app: 'prometheus' } + p._config.selectorLabels,
selector: p._config.selectorLabels,
sessionAffinity: 'ClientIP',
},
},
@@ -243,7 +246,7 @@ function(params) {
},
[if (defaults + params).replicas > 1 then 'podDisruptionBudget']: {
apiVersion: 'policy/v1beta1',
apiVersion: 'policy/v1',
kind: 'PodDisruptionBudget',
metadata: {
name: 'prometheus-' + p._config.name,
@@ -317,10 +320,10 @@ function(params) {
selector: {
matchLabels: p._config.selectorLabels,
},
endpoints: [{
port: 'web',
interval: '30s',
}],
endpoints: [
{ port: 'web', interval: '30s' },
{ port: 'reloader-web', interval: '30s' },
],
},
},
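Together with the matching Alertmanager change earlier in this commit, the config-reloader's listen port is now a component parameter; a sketch of overriding it for both components (the port value is illustrative):

```jsonnet
local kp = (import 'kube-prometheus/main.libsonnet') + {
  values+:: {
    prometheus+: { reloaderPort: 9091 },    // default is 8080, per the hunks above
    alertmanager+: { reloaderPort: 9091 },
  },
};

{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) }
```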

View File

@@ -39,7 +39,7 @@ local utils = import './lib/utils.libsonnet';
images: {
alertmanager: 'quay.io/prometheus/alertmanager:v' + $.values.common.versions.alertmanager,
blackboxExporter: 'quay.io/prometheus/blackbox-exporter:v' + $.values.common.versions.blackboxExporter,
grafana: 'grafana/grafana:v' + $.values.common.versions.grafana,
grafana: 'grafana/grafana:' + $.values.common.versions.grafana,
kubeStateMetrics: 'k8s.gcr.io/kube-state-metrics/kube-state-metrics:v' + $.values.common.versions.kubeStateMetrics,
nodeExporter: 'quay.io/prometheus/node-exporter:v' + $.values.common.versions.nodeExporter,
prometheus: 'quay.io/prometheus/prometheus:v' + $.values.common.versions.prometheus,

View File

@@ -1,3 +1,3 @@
# Adding a new platform specific configuration
Adding a new platform specific configuration requires updating the [README](../../../README.md#cluster-creation-tools) and the [platforms.jsonnet](./platform.jsonnet) file by adding the platform to the list of existing ones. This allows the new platform to be discoverable and easily configurable by users.
Adding a new platform specific configuration requires updating the [README](../../../README.md#cluster-creation-tools) and the [platforms.libsonnet](platforms.libsonnet) file by adding the platform to the list of existing ones. This allows the new platform to be discoverable and easily configurable by users.

View File

@@ -1,12 +1,12 @@
{
"alertmanager": "0.23.0",
"blackboxExporter": "0.19.0",
"grafana": "8.1.3",
"kubeStateMetrics": "2.2.0",
"grafana": "8.2.1",
"kubeStateMetrics": "2.2.3",
"nodeExporter": "1.2.2",
"prometheus": "2.29.2",
"prometheusAdapter": "0.9.0",
"prometheusOperator": "0.50.0",
"prometheus": "2.30.3",
"prometheusAdapter": "0.9.1",
"prometheusOperator": "0.51.2",
"kubeRbacProxy": "0.11.0",
"configmapReload": "0.5.0"
}

View File

@@ -8,8 +8,8 @@
"subdir": "grafana"
}
},
"version": "c3b14b24b83cfe9abf1064649d19e2d679f033fb",
"sum": "YrE4DNQsWgYWs6h0j/FjQETt8xDXdYdsslb1WK7xQEk="
"version": "199e363523104ff8b3a12483a4e3eca86372b078",
"sum": "/jDHzVAjHB4AOLkJHw1GyATX5ogZ1iMdcJXZAgaG3+g="
},
{
"source": {
@@ -18,8 +18,8 @@
"subdir": "contrib/mixin"
}
},
"version": "c2937d78d2722d774f69dbf91a956f382d32f4d3",
"sum": "5XhYOigrKipOWDbIn9hlrz7JcbelzvJnormxSaup9JI="
"version": "38a7d79810bd273bd078bf0931480b743afee003",
"sum": "drRRtMPhvpUZ8v7Weqz7Cg2pwDA2cSb6X1pjBPoCx1w="
},
{
"source": {
@@ -28,8 +28,8 @@
"subdir": "grafonnet"
}
},
"version": "05fb200ee1a1816fc1b4c522071d5606d8dd71c1",
"sum": "mEoObbqbyVaXrHFEJSM2Nad31tOvadzIevWuyNHHBgI="
"version": "3626fc4dc2326931c530861ac5bebe39444f6cbf",
"sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w="
},
{
"source": {
@@ -38,8 +38,8 @@
"subdir": "grafana-builder"
}
},
"version": "746874e4836a4bfbb7034d32de0c98ab1282aaae",
"sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
"version": "87b6b50706dfa57b2470470422770f8e7574b7db",
"sum": "U34Nd1ViO2LZ3D8IzygPPRfUcy6zOgCnTMVHZ+9O/QE="
},
{
"source": {
@@ -48,8 +48,8 @@
"subdir": ""
}
},
"version": "2b27a09a667091cef74776b690ccceaf55995e29",
"sum": "j2jPdrcM3iuaUK+6V9jWn2M3Fapr0KtI8FZ1KQoHIGA="
"version": "8dc2c0d69f762d943c5bfbdcc17645e346d610ca",
"sum": "TamniMXp0Jy6E5OMOYtcrTJ1P+rFTVNuiOZSkxvckb8="
},
{
"source": {
@@ -58,7 +58,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "2b27a09a667091cef74776b690ccceaf55995e29",
"version": "8dc2c0d69f762d943c5bfbdcc17645e346d610ca",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
},
{
@@ -68,8 +68,8 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "d111b6d8e07f8dde1dfe7e688f44242e4aa4f734",
"sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE="
"version": "b730cb415234509e6a1425c79e826f2e7688d27b",
"sum": "U1wzIpTAtOvC1yj43Y8PfvT0JfvnAcMfNH12Wi+ab0Y="
},
{
"source": {
@@ -78,7 +78,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "d111b6d8e07f8dde1dfe7e688f44242e4aa4f734",
"version": "b730cb415234509e6a1425c79e826f2e7688d27b",
"sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk="
},
{
@@ -88,8 +88,8 @@
"subdir": "jsonnet/mixin"
}
},
"version": "2c81b0cf6a5673e08057499a08ddce396b19dda4",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
"version": "f710e9d66a09efdb8edc144af555718b7d7ed2e3",
"sum": "qZ4WgiweaE6eeKtFK60QUjLO8sf2L9Q8fgafWvDcyfY=",
"name": "prometheus-operator-mixin"
},
{
@@ -99,8 +99,8 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "2c81b0cf6a5673e08057499a08ddce396b19dda4",
"sum": "WUuFzKqxzxmTWLeic/IU1SMjdCV/zClt11MHucJ9MSc="
"version": "f710e9d66a09efdb8edc144af555718b7d7ed2e3",
"sum": "4e3A/CccaxvLdWFPKJlC/P9RbPhSX6cH/Nj8+N1DBzg="
},
{
"source": {
@@ -109,7 +109,7 @@
"subdir": "doc/alertmanager-mixin"
}
},
"version": "44011410d7065487789c447ce55157ae6e0b917d",
"version": "1b8afe7cb5aafe59442e35979ec57401145ea26b",
"sum": "pep+dHzfIjh2SU5pEkwilMCAT/NoL6YYflV4x8cr7vU=",
"name": "alertmanager"
},
@@ -120,8 +120,8 @@
"subdir": "docs/node-mixin"
}
},
"version": "dc68e035a5b37a9a3b47e1547f07d96df29ba575",
"sum": "OFNs9Te1QMqSscXqNqMv0zwaJoJxaEg7NyQVNyT4VeA="
"version": "a59b2d89903229db0019f73200ec209758f2fd26",
"sum": "Yr1xB+EEdBYRbsCtl4MDvx6phDg3UoMQtfpWADHyeGk="
},
{
"source": {
@@ -130,7 +130,7 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "46286cb6abfff961e8c257de091443e835ec444f",
"version": "c092a74be9cc3e8e3db41efe3136128cef6c1add",
"sum": "m4VHwft4fUcxzL4+52lLZG/V5aH5ZEdjaweb88vISL0=",
"name": "prometheus"
},
@@ -141,8 +141,8 @@
"subdir": "mixin"
}
},
"version": "2dd8c22e8c15f5ec0daaa07ae20be44bed419aa5",
"sum": "X+060DnePPeN/87fgj0SrfxVitywTk8hZA9V4nHxl1g=",
"version": "d2d53e575b489a8cbfc9e1723d0e3f62a68faf39",
"sum": "Og+wEHfgzXBvBLAeeQvGNoiCw3FY4LQHlJdpsG/owj8=",
"name": "thanos-mixin"
},
{

View File

@@ -1,4 +1,4 @@
apiVersion: policy/v1beta1
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:

View File

@@ -16,7 +16,8 @@ spec:
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
description: Configuration has failed to load for {{ $labels.namespace }}/{{
$labels.pod}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |
@@ -28,9 +29,11 @@ spec:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only
found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster members.
summary: A member of an Alertmanager cluster has not found all other cluster
members.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@@ -42,7 +45,9 @@ spec:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed
to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration
}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |
@@ -57,9 +62,12 @@ spec:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
summary: All Alertmanager instances in a cluster failed to send notifications
to a critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
@@ -72,9 +80,12 @@ spec:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
summary: All Alertmanager instances in a cluster failed to send notifications
to a non-critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
@@ -87,7 +98,8 @@ spec:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
description: Alertmanager instances within the {{$labels.job}} cluster have
different configurations.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |
@@ -100,9 +112,12 @@ spec:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have been up for less than half of the
last 5m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster are down.
summary: Half or more of the Alertmanager instances within the same cluster
are down.
expr: |
(
count by (namespace,service) (
@@ -119,9 +134,12 @@ spec:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have restarted at least 5 times in the
last 10m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
summary: Half or more of the Alertmanager instances within the same cluster
are crashlooping.
expr: |
(
count by (namespace,service) (

View File

@@ -14,9 +14,11 @@ spec:
- name: web
port: 9093
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
alertmanager: main
app: alertmanager
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus

View File

@@ -12,6 +12,8 @@ spec:
endpoints:
- interval: 30s
port: web
- interval: 30s
port: reloader-web
selector:
matchLabels:
alertmanager: main

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.1.3
app.kubernetes.io/version: 8.2.1
name: grafana-config
namespace: monitoring
stringData:

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.1.3
app.kubernetes.io/version: 8.2.1
name: grafana-datasources
namespace: monitoring
stringData:

File diff suppressed because it is too large

View File

@@ -22,6 +22,6 @@ metadata:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.1.3
app.kubernetes.io/version: 8.2.1
name: grafana-dashboards
namespace: monitoring

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.1.3
app.kubernetes.io/version: 8.2.1
name: grafana
namespace: monitoring
spec:
@@ -18,18 +18,18 @@ spec:
template:
metadata:
annotations:
checksum/grafana-config: e1f5b84a1d40edb8a6527c98d24ff656
checksum/grafana-dashboardproviders: 2c7c248e5512bb5576d633004725159c
checksum/grafana-datasources: b2cbbea3079b8634b7bdf42cb56c1537
checksum/grafana-config: 11905dc0549e921f5d3befd288dbf9d5
checksum/grafana-dashboardproviders: 4278ba47b6379fd0ee12ad9c15fedda2
checksum/grafana-datasources: c83e12e4791b0aef701753f70bfc1fe9
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.1.3
app.kubernetes.io/version: 8.2.1
spec:
containers:
- env: []
image: grafana/grafana:8.1.3
image: grafana/grafana:8.2.1
name: grafana
ports:
- containerPort: 3000

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.1.3
app.kubernetes.io/version: 8.2.1
name: grafana
namespace: monitoring
spec:

View File

@@ -1,5 +1,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.2.1
name: grafana
namespace: monitoring

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.1.3
app.kubernetes.io/version: 8.2.1
name: grafana
namespace: monitoring
spec:

View File

@@ -15,10 +15,12 @@ spec:
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
namespace, service)) > 10
for: 10m
labels:
severity: warning
@@ -31,7 +33,8 @@ spec:
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager is working properly.
summary: An alert that should always be firing to certify that Alertmanager
is working properly.
expr: vector(1)
labels:
severity: none
@@ -39,7 +42,8 @@ spec:
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
description: Network interface "{{ $labels.device }}" changing its up status
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
summary: Network interface is often changing its status
expr: |
@@ -49,17 +53,21 @@ spec:
severity: warning
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.2.0
app.kubernetes.io/version: 2.2.3
name: kube-state-metrics
rules:
- apiGroups:

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.2.0
app.kubernetes.io/version: 2.2.3
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.2.0
app.kubernetes.io/version: 2.2.3
name: kube-state-metrics
namespace: monitoring
spec:
@@ -23,7 +23,7 @@ spec:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.2.0
app.kubernetes.io/version: 2.2.3
spec:
containers:
- args:
@@ -31,7 +31,7 @@ spec:
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.2.0
image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.2.3
name: kube-state-metrics
resources:
limits:

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.2.0
app.kubernetes.io/version: 2.2.3
prometheus: k8s
role: alert-rules
name: kube-state-metrics-rules
@@ -16,7 +16,9 @@ spec:
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate
in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
@@ -29,7 +31,9 @@ spec:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate
in watch operations. This is likely causing it to not be able to expose
metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
@@ -42,7 +46,9 @@ spec:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
description: kube-state-metrics pods are running with different --total-shards
configuration, some Kubernetes objects may be exposed multiple times or
not exposed at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: |
@@ -52,7 +58,8 @@ spec:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
description: kube-state-metrics shards are missing, some Kubernetes objects
are not being exposed.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.2.0
app.kubernetes.io/version: 2.2.3
name: kube-state-metrics
namespace: monitoring
spec:

View File

@@ -5,6 +5,6 @@ metadata:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.2.0
app.kubernetes.io/version: 2.2.3
name: kube-state-metrics
namespace: monitoring

View File

@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.2.0
app.kubernetes.io/version: 2.2.3
name: kube-state-metrics
namespace: monitoring
spec:

View File

@@ -14,19 +14,19 @@ spec:
rules:
- alert: KubePodCrashLooping
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes.
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is in waiting state (reason: "CrashLoopBackOff").'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
summary: Pod is crash looping.
expr: |
increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) > 0
and
kube_pod_container_status_waiting{job="kube-state-metrics"} == 1
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1
for: 15m
labels:
severity: warning
- alert: KubePodNotReady
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |
@@ -42,7 +42,9 @@ spec:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has
not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
expr: |
@@ -54,7 +56,8 @@ spec:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |
@@ -72,7 +75,8 @@ spec:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |
@@ -90,7 +94,9 @@ spec:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |
@@ -102,7 +108,8 @@ spec:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
update has not been rolled out.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |
@@ -128,7 +135,8 @@ spec:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has
not finished or progressed for at least 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: |
@@ -160,7 +168,8 @@ spec:
severity: warning
- alert: KubeContainerWaiting
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: |
@@ -170,7 +179,8 @@ spec:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |
@@ -182,7 +192,8 @@ spec:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: |
@@ -192,7 +203,8 @@ spec:
severity: warning
- alert: KubeJobCompletion
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
more than 12 hours to complete.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion
summary: Job did not complete in time
expr: |
@@ -202,7 +214,8 @@ spec:
severity: warning
- alert: KubeJobFailed
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
complete. Removing failed job after investigation should clear this alert.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
summary: Job failed to complete.
expr: |
@@ -212,7 +225,8 @@ spec:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has not matched the desired number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
expr: |
@@ -234,7 +248,8 @@ spec:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has been running at max replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
summary: HPA is running at max replicas
expr: |
@@ -248,7 +263,8 @@ spec:
rules:
- alert: KubeCPUOvercommit
annotations:
description: Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
description: Cluster has overcommitted CPU resource requests for Pods by {{
$value }} CPU shares and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |
@@ -260,7 +276,8 @@ spec:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
description: Cluster has overcommitted memory resource requests for Pods by {{ $value }} bytes and cannot tolerate node failure.
description: Cluster has overcommitted memory resource requests for Pods by
{{ $value }} bytes and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |
@@ -298,7 +315,8 @@ spec:
severity: warning
- alert: KubeQuotaAlmostFull
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: |
@@ -311,7 +329,8 @@ spec:
severity: info
- alert: KubeQuotaFullyUsed
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
summary: Namespace quota is fully used.
expr: |
@@ -324,7 +343,8 @@ spec:
severity: info
- alert: KubeQuotaExceeded
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
summary: Namespace quota has exceeded the limits.
expr: |
@@ -337,7 +357,9 @@ spec:
severity: warning
- alert: CPUThrottlingHigh
annotations:
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
$labels.pod }}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: |
@@ -352,7 +374,9 @@ spec:
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |
@@ -363,12 +387,17 @@ spec:
) < 0.03
and
kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
unless on(namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
description: Based on recent sampling, the PersistentVolume claimed by {{
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is
expected to fill up within four days. Currently {{ $value | humanizePercentage
}} is available.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |
@@ -381,12 +410,15 @@ spec:
kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on(namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
for: 1h
labels:
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
description: The persistent volume {{ $labels.persistentvolume }} has status
{{ $labels.phase }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
summary: PersistentVolume is having issues with provisioning.
expr: |
@@ -398,7 +430,8 @@ spec:
rules:
- alert: KubeVersionMismatch
annotations:
description: There are {{ $value }} different semantic versions of Kubernetes components running.
description: There are {{ $value }} different semantic versions of Kubernetes
components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: |
@@ -408,7 +441,8 @@ spec:
severity: warning
- alert: KubeClientErrors
annotations:
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ $value | humanizePercentage }} errors.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: |
@@ -481,7 +515,8 @@ spec:
rules:
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
description: A client certificate used to authenticate to the apiserver is
expiring in less than 7.0 days.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |
@@ -490,7 +525,8 @@ spec:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
description: A client certificate used to authenticate to the apiserver is
expiring in less than 24.0 hours.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |
@@ -499,7 +535,9 @@ spec:
severity: critical
- alert: AggregatedAPIErrors
annotations:
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
has reported errors. It has appeared unavailable {{ $value | humanize }}
times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors
summary: An aggregated API has reported errors.
expr: |
@@ -508,7 +546,8 @@ spec:
severity: warning
- alert: AggregatedAPIDown
annotations:
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown
summary: An aggregated API is down.
expr: |
@@ -528,9 +567,11 @@ spec:
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
description: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
description: The apiserver has terminated {{ $value | humanizePercentage }}
of its incoming requests.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
summary: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
summary: The apiserver has terminated {{ $value | humanizePercentage }} of
its incoming requests.
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
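The termination ratio above is a plain share-of-total calculation: terminated requests divided by served plus terminated. With hypothetical rates over the 10m window:

    # terminated / (served + terminated)
    #   30/s terminated, 120/s served: 30 / (120 + 30) = 0.20 -> at the > 0.20 boundary, no alert yet
    #   40/s terminated, 120/s served: 40 / (120 + 40) = 0.25 -> fires after 'for: 5m'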
@@ -550,7 +591,8 @@ spec:
severity: warning
- alert: KubeNodeUnreachable
annotations:
description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
description: '{{ $labels.node }} is unreachable and some workloads may be
rescheduled.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
summary: Node is unreachable.
expr: |
@@ -560,7 +602,8 @@ spec:
severity: warning
- alert: KubeletTooManyPods
annotations:
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
summary: Kubelet is running at capacity.
expr: |
@@ -576,7 +619,8 @@ spec:
severity: info
- alert: KubeNodeReadinessFlapping
annotations:
description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
description: The readiness status of node {{ $labels.node }} has changed {{
$value }} times in the last 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: |
@@ -586,7 +630,8 @@ spec:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
duration of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: |
@@ -596,7 +641,8 @@ spec:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: |
@@ -606,7 +652,8 @@ spec:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: |
@@ -615,7 +662,8 @@ spec:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: |
@@ -624,7 +672,8 @@ spec:
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: |
@@ -633,7 +682,8 @@ spec:
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: |
@@ -642,7 +692,8 @@ spec:
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).
description: Kubelet on node {{ $labels.node }} has failed to renew its client
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
expr: |
@@ -652,7 +703,8 @@ spec:
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
description: Kubelet on node {{ $labels.node }} has failed to renew its server
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
expr: |
@@ -686,7 +738,8 @@ spec:
rules:
- alert: KubeControllerManagerDown
annotations:
description: KubeControllerManager has disappeared from Prometheus target discovery.
description: KubeControllerManager has disappeared from Prometheus target
discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown
summary: Target disappeared from Prometheus target discovery.
expr: |
@@ -694,6 +747,18 @@ spec:
for: 15m
labels:
severity: critical
- name: kubernetes-system-kube-proxy
rules:
- alert: KubeProxyDown
annotations:
description: KubeProxy has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeproxydown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-proxy"} == 1)
for: 15m
labels:
severity: critical
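The new kubernetes-system-kube-proxy group relies on absent(), which returns a single series with value 1 only when nothing matches the inner selector, so this alert also fires on clusters that intentionally run without kube-proxy (for example, with an eBPF dataplane). A sketch of the behaviour, plus a hypothetical guard for such clusters:

    # Empty inner vector (no healthy kube-proxy target) -> {job="kube-proxy"} 1
    absent(up{job="kube-proxy"} == 1)
    # Hypothetical variant that stays silent when no targets are configured at all:
    absent(up{job="kube-proxy"} == 1) and on() count(up{job="kube-proxy"}) > 0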
- name: kube-apiserver-burnrate.rules
rules:
- expr: |
@@ -61,11 +61,16 @@ spec:
sourceLabels:
- __name__
- action: drop
regex: (container_fs_.*|container_spec_.*|container_blkio_device_usage_total|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
sourceLabels:
- __name__
- pod
- namespace
- action: drop
regex: (container_blkio_device_usage_total);.+
sourceLabels:
- __name__
- container
path: /metrics/cadvisor
port: https-metrics
relabelings:
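In these cadvisor metricRelabelings, the sourceLabels values are joined with ';' and the regex must match the whole joined string: the first rule's trailing ';;' therefore drops those metric names only when the pod and namespace labels are both empty, while the new second rule drops container_blkio_device_usage_total only when the container label is non-empty, keeping the pod-level series. A minimal sketch of that mechanism:

    - action: drop
      sourceLabels: [__name__, container]
      # Joined value looks like 'container_blkio_device_usage_total;nginx';
      # the '.+' after the ';' requires a non-empty container label, so only
      # per-container series are dropped.
      regex: (container_blkio_device_usage_total);.+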
@@ -29,7 +29,7 @@ spec:
- --path.rootfs=/host/root
- --no-collector.wifi
- --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
- --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
- --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
image: quay.io/prometheus/node-exporter:v1.2.2
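The flag change here follows node_exporter's rename of the filesystem collector options in the 1.2 series: --collector.filesystem.ignored-mount-points became --collector.filesystem.mount-points-exclude. The regex itself is unchanged; note how the trailing ($|/) keeps lookalike paths:

    #   /sys            -> excluded ('$' alternative)
    #   /sys/fs/cgroup  -> excluded ('/' alternative)
    #   /system         -> kept ('t' is neither end-of-string nor '/')
    - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)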
@@ -16,7 +16,9 @@ spec:
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
@@ -32,7 +34,9 @@ spec:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
@@ -48,7 +52,8 @@ spec:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
@@ -62,7 +67,8 @@ spec:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
@@ -76,7 +82,9 @@ spec:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
@@ -92,7 +100,9 @@ spec:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
@@ -108,7 +118,8 @@ spec:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
@@ -122,7 +133,8 @@ spec:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
@@ -136,7 +148,8 @@ spec:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
@@ -146,7 +159,8 @@ spec:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
@@ -174,7 +188,8 @@ spec:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
description: Clock on {{ $labels.instance }} is out of sync by more than 300s.
Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
expr: |
@@ -194,7 +209,8 @@ spec:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
description: Clock on {{ $labels.instance }} is not synchronising. Ensure
NTP is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |
@@ -206,7 +222,9 @@ spec:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is
in a degraded state due to one or more disk failures. The number of spare
drives is insufficient to fix the issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
expr: |
@@ -216,7 +234,8 @@ spec:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
description: At least one device in RAID array on {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
expr: |
@@ -225,7 +244,8 @@ spec:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
description: File descriptors limit at {{ $labels.instance }} is currently
at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
@@ -237,7 +257,8 @@ spec:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
description: File descriptors limit at {{ $labels.instance }} is currently
at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
@@ -269,6 +290,16 @@ spec:
- expr: |
1 - (
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
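This recording rule prefers node_memory_MemAvailable_bytes and uses 'or' to fall back to Buffers + Cached + MemFree + Slab on kernels that predate MemAvailable (added in Linux 3.14). A worked example with hypothetical values on such a kernel:

    #   MemTotal 8 GiB, MemFree 1 GiB, Buffers 0.5 GiB, Cached 2 GiB, Slab 0.5 GiB
    #   1 - (0.5 + 2 + 1 + 0.5) / 8 = 1 - 0.5 = 0.5  -> 50% of memory in use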
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: v1beta1.metrics.k8s.io
spec:
group: metrics.k8s.io
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: prometheus-adapter
rules:
- apiGroups:
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
rbac.authorization.k8s.io/aggregate-to-admin: "true"
rbac.authorization.k8s.io/aggregate-to-edit: "true"
rbac.authorization.k8s.io/aggregate-to-view: "true"
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: prometheus-adapter
roleRef:
apiGroup: rbac.authorization.k8s.io
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: resource-metrics:system:auth-delegator
roleRef:
apiGroup: rbac.authorization.k8s.io
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: resource-metrics-server-resources
rules:
- apiGroups:
@@ -64,6 +64,6 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: adapter-config
namespace: monitoring
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: prometheus-adapter
namespace: monitoring
spec:
@@ -25,7 +25,7 @@ spec:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
spec:
containers:
- args:
@@ -36,7 +36,7 @@ spec:
- --prometheus-url=http://prometheus-k8s.monitoring.svc.cluster.local:9090/
- --secure-port=6443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
image: k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.0
image: k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1
name: prometheus-adapter
ports:
- containerPort: 6443
@@ -1,11 +1,11 @@
apiVersion: policy/v1beta1
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: prometheus-adapter
namespace: monitoring
spec:
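The apiVersion bump here is needed because PodDisruptionBudget went GA as policy/v1 in Kubernetes 1.21, and policy/v1beta1 is scheduled for removal in 1.25. One behavioural difference worth flagging: under policy/v1 an empty selector matches every pod in the namespace, whereas v1beta1 matched none. A complete minimal manifest for reference (the spec values are assumed, since this hunk truncates them):

    apiVersion: policy/v1
    kind: PodDisruptionBudget
    metadata:
      name: prometheus-adapter
      namespace: monitoring
    spec:
      minAvailable: 1   # assumed; not shown in this hunk
      selector:
        matchLabels:
          app.kubernetes.io/name: prometheus-adapter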
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: resource-metrics-auth-reader
namespace: kube-system
roleRef:
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: prometheus-adapter
namespace: monitoring
spec:
@@ -5,6 +5,6 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: prometheus-adapter
namespace: monitoring
@@ -5,13 +5,18 @@ metadata:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.9.0
app.kubernetes.io/version: 0.9.1
name: prometheus-adapter
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: (apiserver_client_certificate_.*|apiserver_envelope_.*|apiserver_flowcontrol_.*|apiserver_storage_.*|apiserver_webhooks_.*|workqueue_.*)
sourceLabels:
- __name__
port: https
scheme: https
tlsConfig:
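The new metricRelabelings trim the client-go and apiserver-library metrics that prometheus-adapter exposes on its own /metrics endpoint, which tend to dominate its series count. A rough way to gauge the effect before rollout (the job label is an assumption about the scrape config):

    count({__name__=~"apiserver_client_certificate_.*|apiserver_envelope_.*|apiserver_flowcontrol_.*|apiserver_storage_.*|apiserver_webhooks_.*|workqueue_.*", job="prometheus-adapter"})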
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
rules:
- apiGroups:
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
roleRef:
apiGroup: rbac.authorization.k8s.io
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.50.0
app.kubernetes.io/version: 0.51.2
prometheus: k8s
role: alert-rules
name: prometheus-operator-rules
@@ -16,7 +16,8 @@ spec:
rules:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
description: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
summary: Errors while performing list operations in controller.
expr: |
@@ -26,7 +27,8 @@ spec:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
description: Errors while performing watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: |
@@ -36,7 +38,8 @@ spec:
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
namespace fails to reconcile {{ $value }} objects.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed
summary: Last controller reconciliation failed
expr: |
@@ -46,7 +49,9 @@ spec:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
description: '{{ $value | humanizePercentage }} of reconciling operations
failed for {{ $labels.controller }} controller in {{ $labels.namespace }}
namespace.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
summary: Errors while reconciling controller.
expr: |
@@ -56,7 +61,8 @@ spec:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
description: Errors while reconciling Prometheus in {{ $labels.namespace }}
Namespace.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus.
expr: |
@@ -66,7 +72,8 @@ spec:
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
description: Prometheus operator in {{ $labels.namespace }} namespace isn't
ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
expr: |
@@ -76,7 +83,9 @@ spec:
severity: warning
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
description: Prometheus operator in {{ $labels.namespace }} namespace rejected
{{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource
}} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources
summary: Resources rejected by Prometheus operator
expr: |
@@ -84,3 +93,17 @@ spec:
for: 5m
labels:
severity: warning
- name: config-reloaders
rules:
- alert: ConfigReloaderSidecarErrors
annotations:
description: |-
Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in the {{$labels.namespace}} namespace.
As a result, the configuration for the service running in {{$labels.pod}} may be stale and can no longer be updated.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors
summary: config-reloader sidecar has not had a successful reload for 10m
expr: |
max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
labels:
severity: warning
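The config-reloaders group added above watches reloader_last_reload_successful, a 0/1 gauge exported by each config-reloader sidecar; the scrape for it is wired up by the Service/ServiceMonitor changes further down this diff. Using max_over_time makes the expression flap-resistant:

    # The 5m max is 0 only if every sample in the window reported a failed
    # reload, so a single missed scrape or brief success clears the condition;
    # 'for: 10m' then requires it to hold continuously before paging.
    max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0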
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.50.0
app.kubernetes.io/version: 0.51.2
name: prometheus-operator
namespace: monitoring
spec:
@@ -21,4 +21,4 @@ spec:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.50.0
app.kubernetes.io/version: 0.51.2
@@ -1,11 +1,11 @@
apiVersion: policy/v1beta1
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: monitoring
spec:
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
prometheus: k8s
name: k8s
namespace: monitoring
@@ -18,7 +18,7 @@ spec:
port: web
enableFeatures: []
externalLabels: {}
image: quay.io/prometheus/prometheus:v2.29.2
image: quay.io/prometheus/prometheus:v2.30.3
nodeSelector:
kubernetes.io/os: linux
podMetadata:
@@ -26,7 +26,7 @@ spec:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
podMonitorNamespaceSelector: {}
podMonitorSelector: {}
probeNamespaceSelector: {}
@@ -44,4 +44,4 @@ spec:
serviceAccountName: prometheus-k8s
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
version: 2.29.2
version: 2.30.3
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
prometheus: k8s
role: alert-rules
name: prometheus-k8s-prometheus-rules
@@ -16,7 +16,8 @@ spec:
rules:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
reload its configuration.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
summary: Failed Prometheus configuration reload.
expr: |
@@ -28,9 +29,11 @@ spec:
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
is running full.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
summary: Prometheus alert notification queue predicted to run full in less than 30m.
summary: Prometheus alert notification queue predicted to run full in less
than 30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@@ -44,9 +47,11 @@ spec:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
description: '{{ printf "%.1f" $value }}% errors while sending alerts from
Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
summary: Prometheus has encountered more than 1% errors sending alerts to
a specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
@@ -60,7 +65,8 @@ spec:
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected
to any Alertmanagers.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
summary: Prometheus is not connected to any Alertmanagers.
expr: |
@@ -72,7 +78,8 @@ spec:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
{{$value | humanize}} reload failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
summary: Prometheus has issues reloading blocks from disk.
expr: |
@@ -82,7 +89,8 @@ spec:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
{{$value | humanize}} compaction failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
summary: Prometheus has issues compacting blocks.
expr: |
@@ -92,7 +100,8 @@ spec:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
samples.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
summary: Prometheus is not ingesting samples.
expr: |
@@ -110,7 +119,9 @@ spec:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
{{ printf "%.4g" $value }} samples/s with different values but duplicated
timestamp.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
@@ -120,7 +131,8 @@ spec:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
{{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
@@ -130,7 +142,9 @@ spec:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
{{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{
$labels.url }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
summary: Prometheus fails to send samples to remote storage.
expr: |
@@ -150,7 +164,9 @@ spec:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url
}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
summary: Prometheus remote write is behind.
expr: |
@@ -167,9 +183,14 @@ spec:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
desired shards calculation wants to run {{ $value }} shards for queue {{
$labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{
printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
$labels.instance | query | first | value }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@@ -183,7 +204,8 @@ spec:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
evaluate {{ printf "%.0f" $value }} rules in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
summary: Prometheus is failing rule evaluations.
expr: |
@@ -193,7 +215,8 @@ spec:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{
printf "%.0f" $value }} rule group evaluations in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
@@ -203,9 +226,12 @@ spec:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped
{{ printf "%.0f" $value }} targets because the number of targets exceeded
the configured target_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
@@ -213,9 +239,12 @@ spec:
severity: warning
- alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped
{{ printf "%.0f" $value }} targets because some samples exceeded the configured
label_limit, label_name_length_limit or label_value_length_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the labels limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
@@ -223,7 +252,8 @@ spec:
severity: warning
- alert: PrometheusTargetSyncFailure
annotations:
description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.'
description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}}
have failed to sync because invalid configuration was supplied.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
summary: Prometheus has failed to sync targets.
expr: |
@@ -233,7 +263,8 @@ spec:
severity: critical
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s-config
namespace: monitoring
roleRef:
@@ -7,7 +7,7 @@ items:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: default
roleRef:
@@ -25,7 +25,7 @@ items:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: kube-system
roleRef:
@@ -43,7 +43,7 @@ items:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: monitoring
roleRef:
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s-config
namespace: monitoring
rules:
@@ -7,7 +7,7 @@ items:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: default
rules:
@@ -44,7 +44,7 @@ items:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: kube-system
rules:
@@ -81,7 +81,7 @@ items:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: monitoring
rules:
@@ -5,7 +5,7 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
prometheus: k8s
name: prometheus-k8s
namespace: monitoring
@@ -14,8 +14,10 @@ spec:
- name: web
port: 9090
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
app: prometheus
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
@@ -5,6 +5,6 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: monitoring
@@ -5,13 +5,15 @@ metadata:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.2
app.kubernetes.io/version: 2.30.3
name: prometheus-k8s
namespace: monitoring
spec:
endpoints:
- interval: 30s
port: web
- interval: 30s
port: reloader-web
selector:
matchLabels:
app.kubernetes.io/component: prometheus
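These last two hunks are the scrape wiring for the config-reloaders alert group above: the Service gains a reloader-web port in front of the sidecar's 8080, and the ServiceMonitor adds an endpoint that references that port by name, so the two must agree. Condensed sketch (most fields omitted):

    kind: Service
    spec:
      ports:
      - name: reloader-web
        port: 8080
        targetPort: reloader-web
    ---
    kind: ServiceMonitor
    spec:
      endpoints:
      - interval: 30s
        port: reloader-web   # must match the Service port name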