
Commit 31ab8f8

Diogo Guerra authored and Alex-Welsh committed
Update magnum k8s monitoring infra
* Prometheus-server now runs only on master nodes.
* Update prometheus-operator helm chart and tag.
* Update prometheus-adapter version.
* Deprecation notice for prometheus_monitoring component.

task: 41569
story: 2006765

Signed-off-by: Diogo Guerra <[email protected]>
Change-Id: I05e8c2be4e4c8e66a166b485ec7851875dca8b1c
(cherry picked from commit 0934160)
(cherry picked from commit 9a47696)
(cherry picked from commit 3041908)
(cherry picked from commit 5f786f0)
(cherry picked from commit 4abaf43)
(cherry picked from commit 7796430)
1 parent d940659 commit 31ab8f8
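The deprecated prometheus_monitoring label mentioned in the last bullet is superseded by the chart-based stack that monitoring_enabled deploys. A minimal sketch of opting in at cluster creation (cluster and template names are placeholders):

    # Hypothetical names; 'monitoring_enabled' replaces the deprecated
    # 'prometheus_monitoring' label as the way to get cluster monitoring.
    openstack coe cluster create my-cluster \
        --cluster-template my-template \
        --labels monitoring_enabled=true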

File tree: 7 files changed, +1660 −87 lines

doc/source/user/index.rst

Lines changed: 8 additions & 7 deletions
@@ -1185,13 +1185,14 @@ _`container_infra_prefix`
 
   Images that might be needed if 'monitoring_enabled' is 'true':
 
-  * quay.io/prometheus/alertmanager:v0.20.0
-  * docker.io/squareup/ghostunnel:v1.5.2
-  * docker.io/jettech/kube-webhook-certgen:v1.0.0
-  * quay.io/coreos/prometheus-operator:v0.37.0
-  * quay.io/coreos/configmap-reload:v0.0.1
-  * quay.io/coreos/prometheus-config-reloader:v0.37.0
-  * quay.io/prometheus/prometheus:v2.15.2
+  * quay.io/prometheus/alertmanager:v0.21.0
+  * docker.io/jettech/kube-webhook-certgen:v1.5.0
+  * quay.io/prometheus-operator/prometheus-operator:v0.44.0
+  * docker.io/jimmidyson/configmap-reload:v0.4.0
+  * quay.io/prometheus-operator/prometheus-config-reloader:v0.44.0
+  * quay.io/prometheus/prometheus:v2.22.1
+  * quay.io/prometheus/node-exporter:v1.0.1
+  * docker.io/directxman12/k8s-prometheus-adapter:v0.8.2
 
   Images that might be needed if 'cinder_csi_enabled' is 'true':
 
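These image references pair with the container_infra_prefix label: when it is set, Magnum pulls every image from that registry prefix instead of the upstream defaults. A hedged sketch of mirroring a few of the updated images into a private registry (registry.example.com is a placeholder):

    # Sketch: mirror upstream monitoring images into a private registry so
    # container_infra_prefix=registry.example.com/ can serve them.
    for img in \
        quay.io/prometheus/alertmanager:v0.21.0 \
        quay.io/prometheus-operator/prometheus-operator:v0.44.0 \
        quay.io/prometheus/prometheus:v2.22.1; do
        docker pull "${img}"
        docker tag "${img}" "registry.example.com/${img##*/}"   # keep name:tag, drop source registry
        docker push "registry.example.com/${img##*/}"
    done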
doc/source/user/monitoring.rst

Lines changed: 13 additions & 8 deletions
@@ -33,13 +33,15 @@ _`metrics_server_enabled`
 
 _`monitoring_enabled`
   Enable installation of cluster monitoring solution provided by the
-  stable/prometheus-operator helm chart.
+  prometheus-community/kube-prometheus-stack helm chart.
+  To use this service tiller_enabled must be true when using
+  helm_client_tag<v3.0.0.
   Default: false
 
 _`prometheus_adapter_enabled`
   Enable installation of cluster custom metrics provided by the
-  stable/prometheus-adapter helm chart. This service depends on
-  monitoring_enabled.
+  prometheus-community/prometheus-adapter helm chart.
+  This service depends on monitoring_enabled.
   Default: true
 
 To control deployed versions, extra labels are available:
@@ -52,14 +54,17 @@ _`metrics_server_chart_tag`
 
 _`prometheus_operator_chart_tag`
   Add prometheus_operator_chart_tag to select version of the
-  stable/prometheus-operator chart to install. When installing the chart,
-  helm will use the default values of the tag defined and overwrite them based
-  on the prometheus-operator-config ConfigMap currently defined. You must
-  certify that the versions are compatible.
+  prometheus-community/kube-prometheus-stack chart to install.
+  When installing the chart, helm will use the default values of the tag
+  defined and overwrite them based on the prometheus-operator-config
+  ConfigMap currently defined.
+  You must certify that the versions are compatible.
+  Wallaby-default: 17.2.0
 
 _`prometheus_adapter_chart_tag`
-  The stable/prometheus-adapter helm chart version to use.
+  The prometheus-community/prometheus-adapter helm chart version to use.
   Train-default: 1.4.0
+  Wallaby-default: 2.12.1
 
 Full fledged cluster monitoring
 +++++++++++++++++++++++++++++++
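Because chart compatibility must be verified by the operator, pinning both chart tags explicitly at template creation is the predictable path. A minimal sketch using the Wallaby defaults documented above (template and image names are placeholders):

    # Hypothetical template; the labels and their Wallaby defaults come from the doc above.
    openstack coe cluster template create k8s-monitored \
        --coe kubernetes \
        --image fedora-coreos-latest \
        --external-network public \
        --labels monitoring_enabled=true,prometheus_adapter_enabled=true,prometheus_operator_chart_tag=17.2.0,prometheus_adapter_chart_tag=2.12.1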

magnum/drivers/common/templates/kubernetes/helm/prometheus-adapter.sh

Lines changed: 3 additions & 2 deletions
@@ -21,10 +21,11 @@ EOF
     cat << EOF >> ${HELM_CHART_DIR}/values.yaml
 prometheus-adapter:
   image:
-    repository: ${CONTAINER_INFRA_PREFIX:-docker.io/directxman12/}k8s-prometheus-adapter-${ARCH}
+    repository: ${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/prometheus-adapter/}prometheus-adapter
   priorityClassName: "system-cluster-critical"
   prometheus:
-    url: http://web.tcp.prometheus-prometheus.kube-system.svc.cluster.local
+    url: http://web.tcp.magnum-kube-prometheus-sta-prometheus.kube-system.svc.cluster.local
+    path: /prometheus
   resources:
     requests:
       cpu: 150m
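With the adapter now pointed at the renamed Prometheus service and the /prometheus route prefix, its metrics should surface through the aggregated custom metrics API. A quick hedged check from a machine with a working kubeconfig:

    # List the custom metrics served by prometheus-adapter; an empty 'resources'
    # list or an error suggests the prometheus url/path above is wrong.
    kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | python -m json.tool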

magnum/drivers/common/templates/kubernetes/helm/prometheus-operator.sh

Lines changed: 66 additions & 66 deletions
@@ -2,7 +2,7 @@ set +x
 . /etc/sysconfig/heat-params
 set -ex
 
-CHART_NAME="prometheus-operator"
+CHART_NAME="kube-prometheus-stack"
 
 if [ "$(echo ${MONITORING_ENABLED} | tr '[:upper:]' '[:lower:]')" = "true" ]; then
     echo "Writing ${CHART_NAME} config"
@@ -80,22 +80,18 @@ EOF
         PROTOCOL="http"
         INSECURE_SKIP_VERIFY="True"
     fi
-    # FIXME: Force protocol to http as we don't want to use the cluster certs
-    USE_HTTPS="False"
 
     if [ "$(echo ${VERIFY_CA} | tr '[:upper:]' '[:lower:]')" == "false" ]; then
         INSECURE_SKIP_VERIFY="True"
     fi
 
     cat << EOF >> ${HELM_CHART_DIR}/values.yaml
-prometheus-operator:
-
-  defaultRules:
-    rules:
-      #TODO: To enable this we need firstly take care of exposing certs
-      etcd: false
+kube-prometheus-stack:
 
   alertmanager:
+    podDisruptionBudget:
+      enabled: true
+    #config:
     ingress:
       enabled: ${MONITORING_INGRESS_ENABLED}
       annotations:
@@ -108,6 +104,7 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       - ${CLUSTER_ROOT_DOMAIN_NAME}
       paths:
       - /alertmanager${APP_INGRESS_PATH_APPEND}
+      pathType: ImplementationSpecific
       ## TLS configuration for Alertmanager Ingress
       ## Secret must be manually created in the namespace
       tls: []
@@ -118,24 +115,16 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       image:
         repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}alertmanager
       logFormat: json
+      routePrefix: /alertmanager
       externalUrl: https://${CLUSTER_ROOT_DOMAIN_NAME}/alertmanager
-      # routePrefix: /alertmanager
       # resources:
       #   requests:
       #     cpu: 100m
       #     memory: 256Mi
       priorityClassName: "system-cluster-critical"
 
   grafana:
-    image:
-      repository: ${CONTAINER_INFRA_PREFIX:-grafana/}grafana
     #enabled: ${ENABLE_GRAFANA}
-    sidecar:
-      image: ${CONTAINER_INFRA_PREFIX:-kiwigrid/}k8s-sidecar:0.1.99
-    resources:
-      requests:
-        cpu: 100m
-        memory: 128Mi
     adminPassword: ${GRAFANA_ADMIN_PASSWD}
     ingress:
       enabled: ${MONITORING_INGRESS_ENABLED}
@@ -146,13 +135,24 @@ ${APP_INGRESS_ANNOTATIONS}
       ## Must be provided if Ingress is enable.
       hosts:
       - ${CLUSTER_ROOT_DOMAIN_NAME}
-      path: /grafana${APP_INGRESS_PATH_APPEND}
+      paths:
+      - /grafana${APP_INGRESS_PATH_APPEND}
+      pathType: ImplementationSpecific
       ## TLS configuration for grafana Ingress
       ## Secret must be manually created in the namespace
       tls: []
       # - secretName: grafana-general-tls
       #   hosts:
       #   - grafana.example.com
+    sidecar:
+      image:
+        repository: ${CONTAINER_INFRA_PREFIX:-quay.io/kiwigrid/}k8s-sidecar
+    image:
+      repository: ${CONTAINER_INFRA_PREFIX:-grafana/}grafana
+    resources:
+      requests:
+        cpu: 100m
+        memory: 128Mi
     persistence:
       enabled: ${APP_GRAFANA_PERSISTENT_STORAGE}
       storageClassName: ${MONITORING_STORAGE_CLASS_NAME}
@@ -162,21 +162,10 @@ ${APP_INGRESS_ANNOTATIONS}
         domain: ${CLUSTER_ROOT_DOMAIN_NAME}
         root_url: https://${CLUSTER_ROOT_DOMAIN_NAME}/grafana
         serve_from_sub_path: true
-      paths:
-        data: /var/lib/grafana/data
-        logs: /var/log/grafana
-        plugins: /var/lib/grafana/plugins
-        provisioning: /etc/grafana/provisioning
-      analytics:
-        check_for_updates: true
       log:
         mode: console
       log.console:
         format: json
-      grafana_net:
-        url: https://grafana.net
-      plugins:
-      - grafana-piechart-panel
 
   kubeApiServer:
     tlsConfig:
@@ -198,9 +187,9 @@ ${APP_INGRESS_ANNOTATIONS}
     serviceMonitor:
       ## Enable scraping kube-controller-manager over https.
       ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
-      https: ${USE_HTTPS}
+      https: "True"
       # Skip TLS certificate validation when scraping
-      insecureSkipVerify: null
+      insecureSkipVerify: "True"
       # Name of the server to use when validating TLS certificate
       serverName: null
 
@@ -242,19 +231,21 @@ ${APP_INGRESS_ANNOTATIONS}
     serviceMonitor:
      ## Enable scraping kube-scheduler over https.
       ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
-      https: ${USE_HTTPS}
+      https: "True"
       ## Skip TLS certificate validation when scraping
-      insecureSkipVerify: null
+      insecureSkipVerify: "True"
       ## Name of the server to use when validating TLS certificate
       serverName: null
 
-  # kubeProxy:
-  #   ## If your kube proxy is not deployed as a pod, specify IPs it can be found on
-  #   endpoints: [] # masters + minions
-  #   serviceMonitor:
-  #     ## Enable scraping kube-proxy over https.
-  #     ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
-  #     https: ${USE_HTTPS}
+  kubeProxy:
+    ## If your kube proxy is not deployed as a pod, specify IPs it can be found on
+    endpoints: ${KUBE_MASTERS_PRIVATE} # masters + minions
+    serviceMonitor:
+      ## Enable scraping kube-proxy over https.
+      ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
+      https: "True"
+      ## Skip TLS certificate validation when scraping
+      insecureSkipVerify: "True"
 
   kube-state-metrics:
     priorityClassName: "system-cluster-critical"
@@ -271,37 +262,34 @@ ${APP_INGRESS_ANNOTATIONS}
       limits:
         cpu: 20m
         memory: 20M
-    extraArgs:
-    - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
-    - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
-    sidecars: []
-    ## - name: nvidia-dcgm-exporter
-    ##   image: nvidia/dcgm-exporter:1.4.3
 
   prometheusOperator:
-    priorityClassName: "system-cluster-critical"
-    tlsProxy:
-      image:
-        repository: ${CONTAINER_INFRA_PREFIX:-squareup/}ghostunnel
     admissionWebhooks:
       patch:
         image:
           repository: ${CONTAINER_INFRA_PREFIX:-jettech/}kube-webhook-certgen
-        priorityClassName: "system-cluster-critical"
-
-    resources: {}
-    # requests:
-    #   cpu: 5m
-    #   memory: 10Mi
+        resources:
+          requests:
+            cpu: 2m
+          limits:
+            memory: 30M
+    # clusterDomain: ${CLUSTER_ROOT_DOMAIN_NAME}
+    priorityClassName: "system-cluster-critical"
     logFormat: json
+    logLevel: info
+    resources:
+      requests:
+        cpu: 2m
+      limits:
+        memory: 32M
     image:
-      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}prometheus-operator
-    configmapReloadImage:
-      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}configmap-reload
+      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus-operator/}prometheus-operator
+    prometheusDefaultBaseImage: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}prometheus
+    alertmanagerDefaultBaseImage: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}alertmanager
     prometheusConfigReloaderImage:
-      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}prometheus-config-reloader
-    hyperkubeImage:
-      repository: ${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}hyperkube
+      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus-operator/}prometheus-config-reloader
+    thanosImage:
+      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/thanos/}thanos
 
   prometheus:
     ingress:
@@ -317,6 +305,7 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       - ${CLUSTER_ROOT_DOMAIN_NAME}
       paths:
      - /prometheus${APP_INGRESS_PATH_APPEND}
+      pathType: ImplementationSpecific
       ## TLS configuration for Prometheus Ingress
       ## Secret must be manually created in the namespace
       tls: []
@@ -332,11 +321,13 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       bearerTokenFile:
     prometheusSpec:
       scrapeInterval: ${MONITORING_INTERVAL_SECONDS}s
-      scrapeInterval: 30s
       evaluationInterval: 30s
       image:
         repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}prometheus
-      retention: 14d
+      tolerations:
+      - key: "node-role.kubernetes.io/master"
+        operator: "Exists"
+        effect: "NoSchedule"
       externalLabels:
         cluster_uuid: ${CLUSTER_UUID}
       externalUrl: https://${CLUSTER_ROOT_DOMAIN_NAME}/prometheus
@@ -352,7 +343,16 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       retention: ${MONITORING_RETENTION_DAYS}d
      retentionSize: ${MONITORING_RETENTION_SIZE_GB}GB
       logFormat: json
-      #routePrefix: /prometheus
+      routePrefix: /prometheus
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: magnum.openstack.org/role
+                operator: In
+                values:
+                - master
       resources:
         requests:
           cpu: ${PROMETHEUS_SERVER_CPU}m
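The nodeAffinity on magnum.openstack.org/role=master plus the master-taint toleration should leave the Prometheus server pods schedulable only on master nodes. A hedged way to confirm after a deploy (pod names depend on the helm release, which the adapter URL above suggests is prefixed magnum-):

    # Show which nodes the prometheus pods landed on
    kubectl -n kube-system get pods -o wide | grep prometheus
    # Show the role label the affinity rule keys on
    kubectl get nodes -L magnum.openstack.org/role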
