From 8789717c481beb6065184251abf3468f7e5bd9b8 Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Mon, 29 Sep 2025 09:56:20 +0200 Subject: [PATCH 1/7] Adding kubernetes monitoring ... --- .../kube-prometheus-stack/values.yaml.gotmpl | 97 +++++++++++++++++++ charts/monitoring/namespace.yaml | 6 ++ 2 files changed, 103 insertions(+) create mode 100644 charts/kube-prometheus-stack/values.yaml.gotmpl create mode 100644 charts/monitoring/namespace.yaml diff --git a/charts/kube-prometheus-stack/values.yaml.gotmpl b/charts/kube-prometheus-stack/values.yaml.gotmpl new file mode 100644 index 00000000..8cbb8988 --- /dev/null +++ b/charts/kube-prometheus-stack/values.yaml.gotmpl @@ -0,0 +1,97 @@ +alertmanager: + enabled: false + +defaultRules: + create: false + +grafana: + enabled: false + +kubeApiServer: + enabled: false + +kubelet: + enabled: false + +kubeControllerManager: + enabled: false + +coreDns: + enabled: false + +kubeEtcd: + enabled: false + +kubeScheduler: + enabled: false + +kubeDns: + enabled: false + +kubeProxy: + enabled: false + +kubeStateMetrics: + enabled: false + +nodeExporter: + enabled: true + +thanosRuler: + enabled: false + +prometheusOperator: + enabled: true + + networkPolicy: &networkPolicy + enabled: true + flavor: Kubernetes + + resources: + limits: + cpu: 1 + memory: 1Gi + requests: + cpu: 0.1 + memory: 256Mi + + nodeSelector: + ops: "true" + +prometheus: + enabled: true + + networkPolicy: *networkPolicy + + # enable once needed (object storage) + thanosService: + enabled: false + + ingress: + enabled: true + annotations: + namespace: "{{ .Release.Namespace }}" + cert-manager.io/cluster-issuer: "cert-issuer" + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.middlewares: traefik-traefik-basic-auth@kubernetescrd # namespace + middleware name + tls: + - secretName: monitoring-tls + hosts: + - {{ requiredEnv "K8S_MONITORING_FQDN" }} + hosts: + - {{ requiredEnv "K8S_MONITORING_FQDN" }} + paths: + - 
/prometheus &pathprefix + pathType: Prefix + + prometheusSpec: + routePrefix: *pathprefix + + retention: 90d + + retentionSize: 100GiB + + scrape_interval: 15s + + nodeSelector: + ops: "true" diff --git a/charts/monitoring/namespace.yaml b/charts/monitoring/namespace.yaml new file mode 100644 index 00000000..eb788b6a --- /dev/null +++ b/charts/monitoring/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring + labels: + pod-security.kubernetes.io/enforce: restricted From b626106f53817acb5d97afe9b4c0919c86521ca2 Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Wed, 1 Oct 2025 09:34:46 +0200 Subject: [PATCH 2/7] Improve makefile --- charts/Makefile | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/charts/Makefile b/charts/Makefile index 39844abb..56ca7f82 100644 --- a/charts/Makefile +++ b/charts/Makefile @@ -4,49 +4,62 @@ REPO_BASE_DIR := $(shell git rev-parse --show-toplevel) include ${REPO_BASE_DIR}/scripts/common.Makefile include $(REPO_CONFIG_LOCATION) +# +# Vars +# + CONFIG_DIR := $(shell dirname $(REPO_CONFIG_LOCATION)) CHART_DIRS := $(wildcard $(REPO_BASE_DIR)/charts/*/) +HELMFILE_EXTRA_ARGS ?= + +HELMFILE := helmfile $(HELMFILE_EXTRA_ARGS) + +# +# Help Targets +# + .PHONY: .check-helmfile-installed .check-helmfile-installed: ## Checks if helmfile is installed @if ! command -v helmfile >/dev/null 2>&1; then \ echo "'helmfile' is not installed. 
Install it to continue ...";\ fi +# +# Artifacts +# + helmfile.yaml: simcore-charts/helmfile.yaml ## Copies the helmfile.yaml to the charts directory cp $(CONFIG_DIR)/$@ $(REPO_BASE_DIR)/charts/helmfile.yaml simcore-charts/helmfile.yaml: ## Copies the simcore helmfile to the charts directory cp $(CONFIG_DIR)/helmfile.simcore.yaml $(REPO_BASE_DIR)/charts/$@ +# +# Targets +# + .PHONY: helmfile-lint helmfile-lint: .check-helmfile-installed helmfile.yaml ## Lints the helmfile set -a; source $(REPO_CONFIG_LOCATION); set +a; \ - helmfile lint + $(HELMFILE) lint .PHONY: helmfile-apply helmfile-apply: .check-helmfile-installed helmfile.yaml ## Applies the helmfile configuration set -a; source $(REPO_CONFIG_LOCATION); set +a; \ - helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml apply + $(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml apply .PHONY: helmfile-sync helmfile-sync: .check-helmfile-installed helmfile.yaml ## Syncs the helmfile configuration (use `helmfile-apply` to deploy the app) set -a; source $(REPO_CONFIG_LOCATION); set +a; \ - helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml sync + $(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml sync .PHONY: helmfile-diff helmfile-diff: .check-helmfile-installed helmfile.yaml ## Shows the differences that would be applied by helmfile @set -a; source $(REPO_CONFIG_LOCATION); set +a; \ - helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml diff + $(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml diff .PHONY: helmfile-delete helmfile-delete: .check-helmfile-installed helmfile.yaml ## Deletes the helmfile configuration @set -a; source $(REPO_CONFIG_LOCATION); set +a; \ - helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml delete - -.PHONY: up -up: helmfile-apply ## Start the stack - -.PHONY: leave -leave: ## Leaves kind cluster - kind delete clusters kind + $(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml delete From 22a12111b48bf48c9887d03d2a442272fa49a590 Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Wed, 1 Oct 
2025 09:35:44 +0200 Subject: [PATCH 3/7] Enable traefik metrics & update monitoring ns --- charts/kube-prometheus-stack/namespaces.yaml | 15 +++++++++++++++ charts/monitoring/namespace.yaml | 6 ------ charts/traefik/values.common.yaml.gotmpl | 7 +++++++ 3 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 charts/kube-prometheus-stack/namespaces.yaml delete mode 100644 charts/monitoring/namespace.yaml diff --git a/charts/kube-prometheus-stack/namespaces.yaml b/charts/kube-prometheus-stack/namespaces.yaml new file mode 100644 index 00000000..c4da2db8 --- /dev/null +++ b/charts/kube-prometheus-stack/namespaces.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring + labels: + pod-security.kubernetes.io/enforce: restricted + +--- + +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring-privileged + labels: + pod-security.kubernetes.io/enforce: privileged diff --git a/charts/monitoring/namespace.yaml b/charts/monitoring/namespace.yaml deleted file mode 100644 index eb788b6a..00000000 --- a/charts/monitoring/namespace.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: monitoring - labels: - pod-security.kubernetes.io/enforce: restricted diff --git a/charts/traefik/values.common.yaml.gotmpl b/charts/traefik/values.common.yaml.gotmpl index a40c8cc7..bb00686e 100644 --- a/charts/traefik/values.common.yaml.gotmpl +++ b/charts/traefik/values.common.yaml.gotmpl @@ -35,3 +35,10 @@ affinity: # https://github.com/traefik/traefik-helm-chart/blob/v28.2.0/traefik/ app.kubernetes.io/name: '{{`{{ template "traefik.name" . 
}}`}}' app.kubernetes.io/instance: '{{ .Release.Name }}' topologyKey: kubernetes.io/hostname + +metrics: + prometheus: + service: + enabled: true + serviceMonitor: + enabled: true From 1b15321d5146bdced369d97c1a64d0a85427306e Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Wed, 1 Oct 2025 09:37:41 +0200 Subject: [PATCH 4/7] add todo comments --- .../kube-prometheus-stack/values.yaml.gotmpl | 67 +++++++++++++++++-- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/charts/kube-prometheus-stack/values.yaml.gotmpl b/charts/kube-prometheus-stack/values.yaml.gotmpl index 8cbb8988..dd037dd5 100644 --- a/charts/kube-prometheus-stack/values.yaml.gotmpl +++ b/charts/kube-prometheus-stack/values.yaml.gotmpl @@ -10,8 +10,9 @@ grafana: kubeApiServer: enabled: false +# container metrics (cpu / memory) kubelet: - enabled: false + enabled: true kubeControllerManager: enabled: false @@ -37,15 +38,18 @@ kubeStateMetrics: nodeExporter: enabled: true +prometheus-node-exporter: + namespaceOverride: "{{ .Release.Namespace }}-privileged" + thanosRuler: enabled: false prometheusOperator: enabled: true - networkPolicy: &networkPolicy + networkPolicy: enabled: true - flavor: Kubernetes + flavor: kubernetes resources: limits: @@ -61,14 +65,45 @@ prometheusOperator: prometheus: enabled: true - networkPolicy: *networkPolicy - - # enable once needed (object storage) + networkPolicy: + enabled: true + flavor: kubernetes + + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: traefik + podSelector: + matchLabels: + app.kubernetes.io/name: traefik + ports: + # TODO: removed hardcode + - port: 9090 + protocol: TCP + egress: + - ports: + # generic port for metrics + - port: 9100 + protocol: TCP + - ports: + # prometheus operator + # TODO: removed hardcode + - port: 10250 + protocol: TCP + - ports: + # kube api server + # TODO: removed hardcode + - port: 6443 + protocol: TCP + + # enable once object storage needed thanosService: enabled: false ingress: 
enabled: true + ingressClassName: "" annotations: namespace: "{{ .Release.Namespace }}" cert-manager.io/cluster-issuer: "cert-issuer" @@ -81,10 +116,13 @@ prometheus: hosts: - {{ requiredEnv "K8S_MONITORING_FQDN" }} paths: - - /prometheus &pathprefix + - &pathprefix /prometheus pathType: Prefix prometheusSpec: + # no high availability in favor of resource savings + replicas: 1 + routePrefix: *pathprefix retention: 90d @@ -95,3 +133,18 @@ prometheus: nodeSelector: ops: "true" + + # without this not all service monitors are picked up + # when enabled chart applies default `matchLabels` but we + # don't want this. So we disable it + serviceMonitorSelectorNilUsesHelmValues: false + + resources: + requests: + memory: 2Gi + cpu: 1 + limits: + memory: 4Gi + cpu: 2 + + # TODO: add persistent storage From a18990d19be6c322992695092721ec466c7f845e Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Thu, 2 Oct 2025 16:14:13 +0200 Subject: [PATCH 5/7] add persistence --- .../kube-prometheus-stack/values.ebs-storage.yaml.gotmpl | 9 +++++++++ .../values.kind-local-storage.yaml.gotmpl | 9 +++++++++ .../values.topolvm-storage.yaml.gotmpl | 9 +++++++++ charts/kube-prometheus-stack/values.yaml.gotmpl | 4 +++- 4 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 charts/kube-prometheus-stack/values.ebs-storage.yaml.gotmpl create mode 100644 charts/kube-prometheus-stack/values.kind-local-storage.yaml.gotmpl create mode 100644 charts/kube-prometheus-stack/values.topolvm-storage.yaml.gotmpl diff --git a/charts/kube-prometheus-stack/values.ebs-storage.yaml.gotmpl b/charts/kube-prometheus-stack/values.ebs-storage.yaml.gotmpl new file mode 100644 index 00000000..20ad4d37 --- /dev/null +++ b/charts/kube-prometheus-stack/values.ebs-storage.yaml.gotmpl @@ -0,0 +1,9 @@ +prometheus: + prometheusSpec: + storageSpec: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 100Gi + storageClassName: "{{ .Values.ebsStorageClassName }}" diff --git 
a/charts/kube-prometheus-stack/values.kind-local-storage.yaml.gotmpl b/charts/kube-prometheus-stack/values.kind-local-storage.yaml.gotmpl new file mode 100644 index 00000000..5c25882e --- /dev/null +++ b/charts/kube-prometheus-stack/values.kind-local-storage.yaml.gotmpl @@ -0,0 +1,9 @@ +prometheus: + prometheusSpec: + storageSpec: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 10Gi + storageClassName: "{{ .Values.kindDefaultStorageClassName }}" diff --git a/charts/kube-prometheus-stack/values.topolvm-storage.yaml.gotmpl b/charts/kube-prometheus-stack/values.topolvm-storage.yaml.gotmpl new file mode 100644 index 00000000..893e2f6c --- /dev/null +++ b/charts/kube-prometheus-stack/values.topolvm-storage.yaml.gotmpl @@ -0,0 +1,9 @@ +prometheus: + prometheusSpec: + storageSpec: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 100Gi + storageClassName: "{{ .Values.topolvmStorageClassName }}" diff --git a/charts/kube-prometheus-stack/values.yaml.gotmpl b/charts/kube-prometheus-stack/values.yaml.gotmpl index dd037dd5..66e2d386 100644 --- a/charts/kube-prometheus-stack/values.yaml.gotmpl +++ b/charts/kube-prometheus-stack/values.yaml.gotmpl @@ -147,4 +147,6 @@ prometheus: memory: 4Gi cpu: 2 - # TODO: add persistent storage + persistentVolumeClaimRetentionPolicy: + whenDeleted: Retain + whenScaled: Retain From 3baa8a43cc2c342bfbb03551c0c14f5cac162bd7 Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Thu, 2 Oct 2025 16:47:25 +0200 Subject: [PATCH 6/7] Remove hardcode & add persistentVolumeClaimRetentionPolicy --- charts/cert-manager/templates/networkpolicy.yaml | 2 +- charts/cert-manager/values.common.yaml.gotmpl | 2 ++ charts/kube-prometheus-stack/values.yaml.gotmpl | 15 +++++++++------ charts/portainer/templates/networkpolicy.yaml | 2 +- charts/portainer/values.yaml.gotmpl | 2 ++ 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/charts/cert-manager/templates/networkpolicy.yaml 
b/charts/cert-manager/templates/networkpolicy.yaml index 7d9c347c..4025f5f2 100644 --- a/charts/cert-manager/templates/networkpolicy.yaml +++ b/charts/cert-manager/templates/networkpolicy.yaml @@ -31,7 +31,7 @@ spec: - 172.16.0.0/12 - 192.168.0.0/16 ports: - - 6443 + - {{ .Values.kubeApiServerPort }} # 6. TCP: cert-manager (controller) -> DNS API endpoints (for ACME DNS01) - action: Allow protocol: TCP diff --git a/charts/cert-manager/values.common.yaml.gotmpl b/charts/cert-manager/values.common.yaml.gotmpl index ba797108..8ad08456 100644 --- a/charts/cert-manager/values.common.yaml.gotmpl +++ b/charts/cert-manager/values.common.yaml.gotmpl @@ -1,3 +1,5 @@ +kubeApiServerPort: {{ .Values.kubeApiServerPort }} + cert-manager: crds: enabled: true diff --git a/charts/kube-prometheus-stack/values.yaml.gotmpl b/charts/kube-prometheus-stack/values.yaml.gotmpl index 66e2d386..3c4bcd7d 100644 --- a/charts/kube-prometheus-stack/values.yaml.gotmpl +++ b/charts/kube-prometheus-stack/values.yaml.gotmpl @@ -62,9 +62,15 @@ prometheusOperator: nodeSelector: ops: "true" + tls: + internalPort: &prometheusOperatorInternalPort 10250 + prometheus: enabled: true + service: + port: &prometheusServicePort 9090 + networkPolicy: enabled: true flavor: kubernetes @@ -78,8 +84,7 @@ prometheus: matchLabels: app.kubernetes.io/name: traefik ports: - # TODO: removed hardcode - - port: 9090 + - port: *prometheusServicePort protocol: TCP egress: - ports: @@ -88,13 +93,11 @@ prometheus: protocol: TCP - ports: # prometheus operator - # TODO: removed hardcode - - port: 10250 + - port: *prometheusOperatorInternalPort protocol: TCP - ports: # kube api server - # TODO: removed hardcode - - port: 6443 + - port: {{ .Values.kubeApiServerPort }} protocol: TCP # enable once object storage needed diff --git a/charts/portainer/templates/networkpolicy.yaml b/charts/portainer/templates/networkpolicy.yaml index 6b21b510..ce856643 100644 --- a/charts/portainer/templates/networkpolicy.yaml +++ 
b/charts/portainer/templates/networkpolicy.yaml @@ -13,7 +13,7 @@ spec: # connect to the Kubernetes API server destination: ports: - - 6443 + - {{ .Values.kubeApiServerPort }} nets: - 10.0.0.0/8 - 172.16.0.0/12 diff --git a/charts/portainer/values.yaml.gotmpl b/charts/portainer/values.yaml.gotmpl index 303d084e..aa8be5a0 100644 --- a/charts/portainer/values.yaml.gotmpl +++ b/charts/portainer/values.yaml.gotmpl @@ -1,5 +1,7 @@ servicePort: &servicePort 9000 +kubeApiServerPort: {{ .Values.kubeApiServerPort }} + portainer: replicaCount: 1 From ca98831130876ce752ec5e59e96c806d16a90a27 Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Thu, 16 Oct 2025 10:50:08 +0200 Subject: [PATCH 7/7] Document ha concerns --- charts/kube-prometheus-stack/README.md | 19 +++++++++++++++++ .../kube-prometheus-stack/values.yaml.gotmpl | 21 ++++++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 charts/kube-prometheus-stack/README.md diff --git a/charts/kube-prometheus-stack/README.md b/charts/kube-prometheus-stack/README.md new file mode 100644 index 00000000..ebeda747 --- /dev/null +++ b/charts/kube-prometheus-stack/README.md @@ -0,0 +1,19 @@ + +## High Availability + +Prometheus Server +* Issue asking how to configure it in `kube-prometheus-stack` https://github.com/prometheus-community/helm-charts/issues/6184 +* Prometheus Operator Documentation https://github.com/prometheus-operator/prometheus-operator/blob/v0.85.0/Documentation/platform/high-availability.md#prometheus + +Prometheus Operator +* Not needed. See https://github.com/prometheus-operator/prometheus-operator/issues/2491 + +## FAQ + +How to expose workload metrics +* Use ServiceMonitor, PodMonitor or Running exporters. 
See https://github.com/prometheus-community/helm-charts/blob/kube-prometheus-stack-77.12.0/charts/kube-prometheus-stack/README.md#prometheusioscrape +* Make sure network policy of prometheus and workload allow all necessary ingress and egress + - prometheus shall be able to egress for metrics and workload should allow ingress for metrics + +Pod Monitor vs Service Monitor: +* https://github.com/prometheus-operator/prometheus-operator/issues/3119 diff --git a/charts/kube-prometheus-stack/values.yaml.gotmpl b/charts/kube-prometheus-stack/values.yaml.gotmpl index 3c4bcd7d..867f1832 100644 --- a/charts/kube-prometheus-stack/values.yaml.gotmpl +++ b/charts/kube-prometheus-stack/values.yaml.gotmpl @@ -123,23 +123,34 @@ prometheus: pathType: Prefix prometheusSpec: - # no high availability in favor of resource savings + # Use 2+ for HA replicas: 1 + # Done for HA + # Needs to differentiate prometheus instances with the same setup + # https://github.com/prometheus-operator/prometheus-operator/blob/v0.85.0/Documentation/platform/high-availability.md#prometheus + # External Labels do not show up in metrics. See https://github.com/prometheus-operator/prometheus-operator/issues/2918#issuecomment-567009499 + replicaExternalLabelName: "prometheus_replica" + + # Done for HA + # Enforce replicas running on different nodes + # Otherwise it does not make sense from HA perspective + podAntiAffinity: "hard" + routePrefix: *pathprefix retention: 90d retentionSize: 100GiB - scrape_interval: 15s + scrapeInterval: 30s + scrapeTimeout: 10s nodeSelector: ops: "true" - # without this not all service monitors are picked up - # when enabled chart applies default `matchLabels` but we - # don't want this. So we disable it + # https://github.com/prometheus-community/helm-charts/blob/kube-prometheus-stack-77.12.0/charts/kube-prometheus-stack/README.md#prometheusioscrape + podMonitorSelectorNilUsesHelmValues: false serviceMonitorSelectorNilUsesHelmValues: false resources: