diff --git a/.dockerignore b/.dockerignore index 796b96d1..b8cad4f8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,2 @@ /build +/deployment diff --git a/.gitignore b/.gitignore index fdf68e8c..beb7c8ba 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,8 @@ latex/ .vs/ .idea/ build -src/simdjson \ No newline at end of file +src/simdjson +/deployment/pcm/smarter-device-manager/ +/deployment/pcm/nri/ +/deployment/pcm/pcm-dashboard.json +/deployment/pcm/_kind* diff --git a/Dockerfile.debug b/Dockerfile.debug new file mode 100644 index 00000000..8b3e04ed --- /dev/null +++ b/Dockerfile.debug @@ -0,0 +1,7 @@ +FROM fedora:40@sha256:4e007f288dce23966216be81ef62ba05d139b9338f327c1d1c73b7167dd47312 as builder + +RUN dnf -y install gcc-c++ git findutils make cmake strace gdb util-linux +COPY . /tmp/pcm +RUN --mount=type=cache,target=/tmp/pcm/build cd /tmp/pcm/build && cmake -D CMAKE_BUILD_TYPE=Debug .. && cmake --build . -t pcm pcm-sensor-server pcm-tpmi -j && mkdir -p /usr/local/bin && cp -v /tmp/pcm/build/bin/pcm* /usr/local/bin/ +#ENV PCM_NO_PERF=1 +ENTRYPOINT [ "/usr/local/bin/pcm-sensor-server", "-p", "9738", "-r" ] diff --git a/deployment/pcm/.helmignore b/deployment/pcm/.helmignore new file mode 100644 index 00000000..5dfaad8e --- /dev/null +++ b/deployment/pcm/.helmignore @@ -0,0 +1,26 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +smarter-device-manager/ +nri/ +autoscaler/ diff --git a/deployment/pcm/Chart.yaml b/deployment/pcm/Chart.yaml new file mode 100644 index 00000000..685e8b4d --- /dev/null +++ b/deployment/pcm/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: pcm +version: 0.1.0 +appVersion: "202404" +description: A PCM Helm chart for Kubernetes +home: https://github.com/intel/pcm +maintainers: + - name: Pawel Palucki + email: pawel.palucki@intel.com diff --git a/deployment/pcm/LICENSE b/deployment/pcm/LICENSE new file mode 100644 index 00000000..2d994393 --- /dev/null +++ b/deployment/pcm/LICENSE @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2009-2024, Intel Corporation +Copyright (c) 2016-2020, opcm +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/deployment/pcm/Makefile b/deployment/pcm/Makefile new file mode 100644 index 00000000..3ddcd322 --- /dev/null +++ b/deployment/pcm/Makefile @@ -0,0 +1,129 @@ + +chart-lint-report.txt: values.yaml templates + docker run -ti --rm -w /pcm -v `realpath $(PWD)/../..`:/pcm quay.io/helmpack/chart-testing ct lint --charts deployment/pcm --validate-maintainers=false | tee chart-lint-report.txt + +# +# kind cluster targets +# +# https://stackoverflow.com/questions/649246/is-it-possible-to-create-a-multi-line-string-variable-in-a-makefile +define KIND_EXTRA_MOUNTS +nodes: +- role: control-plane + extraMounts: + - hostPath: /sys/fs/resctrl + containerPath: /sys/fs/resctrl +endef + +export KIND_EXTRA_MOUNTS +_kind_with_registry.sh: + curl -sl https://kind.sigs.k8s.io/examples/kind-with-registry.sh -o _kind_with_registry.sh.tmp + echo "$$KIND_EXTRA_MOUNTS" >_kind_extra_mounts.txt + sed '/apiVersion: kind.x-k8s.io\/v1alpha4/r _kind_extra_mounts.txt' _kind_with_registry.sh.tmp >_kind_with_registry.sh + chmod +x _kind_with_registry.sh + +_kind_deploy_cluster: _kind_with_registry.sh + ./_kind_with_registry.sh + kind export kubeconfig + touch _kind_deploy_cluster + + +# +# 1) e2e-default: minimal E2e pcm pod only test +# +kind_deploy_pcm: + helm install pcm . 
+ kubectl wait daemonset pcm --for=jsonpath='{.status.numberReady}'=1 + +kind_pcm_test: + helm test pcm + +e2e-default: _kind_deploy_cluster kind_deploy_pcm kind_pcm_test + +# +# 2) e2e-default-local-image: minimal E2e pcm with local image build +# +build_local_image: + (cd ../.. ; docker build . -t localhost:5001/pcm-local) + docker push localhost:5001/pcm-local + +kind_deploy_pcm_local_image: + helm upgrade --install --reset-values --wait pcm . -f values-local-image.yaml + kubectl wait daemonset pcm --for=jsonpath='{.spec.template.spec.containers[0].image'}=localhost:5001/pcm-local:latest + kubectl wait daemonset pcm --for=jsonpath='{.status.numberReady}'=1 + +e2e-default-local-image: _kind_deploy_cluster build_local_image kind_deploy_pcm_local_image kind_pcm_test + +# +# 3) e2e-prometheus: E2E test for podMonitor (pod monitor test) +# +_kind_deploy_prometheus: + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm upgrade --install --reset-values prometheus prometheus-community/kube-prometheus-stack --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false --wait + kubectl wait Prometheus prometheus-kube-prometheus-prometheus --for=jsonpath='{.status.availableReplicas}'=1 + #kubectl wait sts prometheus-prometheus-kube-prometheus-prometheus --for=jsonpath='{.status.replicas}'=1 + touch _kind_deploy_prometheus + +kind_deploy_pcm_with_prometheus: + helm upgrade --install --reset-values pcm . 
--set podMonitor=true + kubectl wait daemonset pcm --for=jsonpath='{.status.numberReady}'=1 + +kind_pcm_test_prometheus: + kubectl proxy & sleep 10 && curl -sL http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy/api/v1/query?query=Measurement_Interval_in_us | grep Measurement_Interval_in_us && kill %1 + +e2e-prometheus: _kind_deploy_cluster _kind_deploy_prometheus kind_deploy_pcm_with_prometheus kind_pcm_test kind_pcm_test_prometheus + +# +# 4) e2e-metal-nfd: e2e thats tests that with node-feature-discovery installed and nfd values are changed, the PCM will be only installed on non hyperviserd system with Intel vendor and RDT available +# +_kind_deploy_nfd: + #kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.16.0-devel + helm repo add nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts + helm upgrade --install --wait nfd nfd/node-feature-discovery --namespace node-feature-discovery --create-namespace + # please be patient NFD requires around 2 minutes to annotate the node ... + kubectl wait node --timeout=2m kind-control-plane --for=jsonpath='{.metadata.labels.feature\.node\.kubernetes\.io\/cpu-model\.vendor_id}'=Intel + +kind_deploy_pcm_with_metal_nfd: + helm upgrade --install --reset-values pcm . 
-f values-metal-nfd.yaml + kubectl wait daemonset --timeout=2m pcm --for=jsonpath='{.status.numberReady}'=1 + +kind_pcm_test_nfd: + kubectl wait daemonset pcm --timeout=2m --for=jsonpath='{.spec.template.spec.nodeSelector.feature\.node\.kubernetes\.io\/cpu-model\.vendor_id}'=Intel + helm test pcm + +e2e-metal-nfd: _kind_deploy_cluster _kind_deploy_nfd kind_deploy_pcm_with_metal_nfd kind_pcm_test + +# +# 5) e2e-vpa: VPA E2E tests (kubernetes/autoscaler is cloned into _kind_autoscaler and its hack/vpa-up.sh script is run from there) +# +_kind_autoscaler: + git clone --depth 1 --single-branch https://github.com/kubernetes/autoscaler _kind_autoscaler + +_kind_deploy_metrics_server: + helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/ + helm repo update + helm upgrade --install --reset-values --set args={--kubelet-insecure-tls} metrics-server metrics-server/metrics-server --namespace kube-system + kubectl wait daemonset pcm --for=jsonpath='{.status.numberReady}'=1 + touch _kind_deploy_metrics_server + +_kind_deploy_vpa: _kind_autoscaler + ./_kind_autoscaler/vertical-pod-autoscaler/hack/vpa-up.sh + touch _kind_deploy_vpa + +kind_deploy_pcm_with_vpa: + helm upgrade --install --reset-values pcm . 
--set verticalPodAutoscaler.enabled=true + kubectl wait daemonset pcm --for=jsonpath='{.status.numberReady}'=1 + +e2e-vpa: _kind_deploy_cluster _kind_deploy_vpa kind_deploy_pcm_with_vpa kind_pcm_test + +# +# Cleanup +# + +clean: + kind delete cluster + docker rm -f kind-registry + rm -fv _kind_with_registry.sh + rm -fv _kind_extra_mounts.txt + rm -fv _kind_with_registry.sh.tmp + rm -fv _kind_deploy_cluster + rm -fv _kind_deploy_prometheus diff --git a/deployment/pcm/README.md b/deployment/pcm/README.md new file mode 100644 index 00000000..510ad941 --- /dev/null +++ b/deployment/pcm/README.md @@ -0,0 +1,429 @@ +-------------------------------------------------------------------------------- +Helm chart instructions +-------------------------------------------------------------------------------- + +### Features: + +- Configurable as non-privileged container (value: `privileged`, default: false) and privileged container, +- Support for bare-metal and VM host configurations (files: [values-metal-nfd.yaml](values-metal.yaml), [values-vm.yaml](values-vm.yaml)), +- Ability to deploy multiple releases alongside configured differently to handle different kinds of machines (bare-metal, VM) at the [same time](#heterogeneous-mixed-vmmetal-instances-cluster), +- Linux Watchdog handling (controlled with `PCM_KEEP_NMI_WATCHDOG`, `PCM_NO_AWS_WORKAROUND`, `nmiWatchdogMount` values). +- Deploy to own namespace with "helm install ... **-n pcm --create-namespace**". +- Silent mode (value: `silent`, default: false). +- Backward compatible with older Linux kernels (<5.8) - (value: cap_perfmon, default: false). 
+- VerticalPodAutoscaler (value: `verticalPodAutoscaler.enabled`, default: false) + +Here are available methods in this chart of metrics collection w.r.t interfaces and required access: + +| Method | Used interfaces | default | Notes | instructions | +|-------------------------|----------------------| ------- | ------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------- | +| unprivileged "indirect" | perf, resctrl | v | recommended, missing metrics: energy metrics (TODO link to issues/PR or node_exporter/rapl_collector) | `helm install . pcm` | +| privileged "indirect" | perf, resctrl | | not recommended, unsecure, no advantages over unprivileged), missing metrics: energy metrics | `helm install . pcm --set privileged=true` | +| privileged "direct" | msr | | not recommended, unsecure and requires msr module pre loaded on host | `helm install . pcm -f values-direct-privileged.yaml` | +| unprivileged "direct" | msr | | not recommended, requires msr module and access to /dev/cpu and /dev/mem (non trivial, like using 3rd plugins) | [link for detailed documentation](docs/direct-unprivileged-deployment.md) | + +For more information about direct/indirect collection methods please see [here](#metric-collection-methods-capabilites-vs-requirements) + +#### Integration features: + +- node-feature-discovery based nodeSelector and nodeAffinity (values: `nfd`, `nfdBaremetalAffinity`, `nfdRDTAffinity`), +- Examples for non-privileged mode using device plugin ("smarter-devices-manager") or using NRI device-injector plugin (TODO) (file: [values-smarter-devices-cpu-mem.yaml](values-smarter-devices-cpu-mem.yaml) ), +- Integration with NRI balloons policy plugin (value: `nriBalloonsPolicyIntegration`), + +#### Debugging features: + +- Local image registry for development (file: [values-local-image.yaml](values-local-image.yaml) ), +- Deploy Prometheus operator' 
PodMonitor (value: `podMonitor`) + +### Getting started + +#### Indirect non-privileged method using Linux abstractions (perf/resctrl) default. + +```sh +helm install pcm . +``` + +#### Direct privileged method +``` +helm install pcm . -f values-direct-privileged.yaml +``` + +#### All opt-in features: Node-feature-discovery + Prometheus podMonitor + vertical + +``` +helm install ... --set nfd=true --set podMonitor=true --set verticalPodAutoscaler.enabled=true +``` + +### Requirements + +- Full set of metrics (uncore/UPI, RDT, energy) requires bare-metal or .metal cloud instance. +- /sys/fs/resctrl has to be mounted on host OS (for default indirect deployment method) +- pod is allowed to be run with privileged capabilities (SYS_ADMIN, SYS_RAWIO) on given namespace in other words: Pod Security Standards allow to run on privileged level, + +``` + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/enforce-version: latest + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/audit-version: latest + pod-security.kubernetes.io/warn: privileged + pod-security.kubernetes.io/warn-version: latest +``` + +More information here: https://kubernetes.io/docs/tutorials/security/ns-level-pss/ . + +### Defaults + +- Indirect method uses Linux abstraction to access event counters (Linux Perf, resctrl) and run container in non-privileged mode. +- hostPort 9738 is exposed on host. (TODO: security review, consider TLS, together with Prometheus scrapping !!). +- Prometheus podMonitor is disabled (enabled it with --set podMonitor=true). + +### TLS + +TODO: +- requires pcm-sensor-server to be build with SSL support +- ERRROR !!!! + +``` +mkdir build +cd build +cmake .. 
-DCMAKE_CXX_FLAGS='-DUSE_SSL -lssl' +zypper install openssl-devel +make pcm-sensor-server -j +openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 3650 -nodes -subj "/C=XX/ST=StateName/L=CityName/O=CompanyName/OU=CompanySectionName/CN=CommonNameOrHostname" +./bin/pcm-sensor-server -s -p 8443 --certificateFile cert.pem --privateKeyFile key.pem +``` + +### Validation on local kind cluster + +#### Requirements + +- kubectl/kind/helm/jq binaries available in PATH, +- docker service up and running. +- the full set of metrics is available only on a bare-metal instance or a cloud .metal instance. + +#### 1) (Optionally) mount resctrl filesystem (for RDT metrics) and unload "msr" kernel module for validation + +``` +mount -t resctrl resctrl /sys/fs/resctrl +``` + +To verify that all metrics are available without msr, unload the "msr" kernel module and make sure perf_event_paranoid has its default value: +``` +rmmod msr +echo 2 > /proc/sys/kernel/perf_event_paranoid +cat /proc/sys/kernel/perf_event_paranoid # expected value 2 +``` + +#### 2) Create kind based Kubernetes cluster + +``` +kind create cluster +``` + +**Note** to be able to collect and test RDT metrics through the resctrl filesystem, the kind cluster has to be created with additional mounts: +``` +nodes: +- role: control-plane + extraMounts: + - hostPath: /sys/fs/resctrl + containerPath: /sys/fs/resctrl +``` +e.g. 
create a kind cluster with a local registry using [this script](https://kind.sigs.k8s.io/docs/user/local-registry/) +and apply the patch to enable resctrl in the following way: + +``` +wget https://kind.sigs.k8s.io/examples/kind-with-registry.sh + +sed -i '/apiVersion: kind.x-k8s.io\/v1alpha4/a \ +nodes:\ +- role: control-plane\ + extraMounts:\ + - hostPath: /sys/fs/resctrl\ + containerPath: /sys/fs/resctrl\ +' kind-with-registry.sh +``` + +Then create the cluster using the patched script above: +``` +bash kind-with-registry.sh +``` + +Check that resctrl is available inside the kind node: +``` +docker exec kind-control-plane ls /sys/fs/resctrl/info +# expected output: +# L3_MON +# MB +# ... +``` + + +and optionally check that the local registry is running (to be used with locally built pcm images, more details [below](#development-with-local-images-and-testing)) +``` +docker ps | grep kind-registry +# expected output: +# e57529be23ea registry:2 "/entrypoint.sh /etc…" 3 weeks ago Up 3 weeks 127.0.0.1:5001->5000/tcp kind-registry +``` + +Export kind kubeconfig as default for further kubectl commands: +``` +kind export kubeconfig +kubectl get pods -A +``` + +#### 3) (Optionally) Deploy Node Feature Discovery (nfd) + +``` +# I.a. Using Kustomize: +kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.16.0-devel + +# I.b. or with Helm Chart: +helm repo add nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts +helm repo update +helm install nfd/node-feature-discovery --namespace node-feature-discovery --create-namespace --generate-name + +# II. 
Check node "labels" with CPU features are added +kubectl get node kind-control-plane -o yaml | grep feature.node +``` + +#### 4) (Optionally) Deploy Prometheus operator + +``` +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm install prometheus prometheus-community/kube-prometheus-stack --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false +kubectl get sts prometheus-prometheus-kube-prometheus-prometheus +``` + +Note: `podMonitorSelectorNilUsesHelmValues` is disabled (set to false) so that the Prometheus operator is able to handle the PCM podMonitor deployed without extra `podMonitorLabels`; otherwise pcm needs to be deployed like this: +`helm install pcm . --set podMonitor=true --set podMonitorLabels.release=prometheus` (assuming Prometheus operator was deployed as "prometheus") + + +#### 5) (Optionally) Deploy metrics-server and vertical-pod-autoscaler + +Note this is irrelevant to pcm-sensor-server functionality, but useful to observe pcm pod CPU/memory usage: + +a) metrics-server + +``` +helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/ +helm repo update +helm upgrade --install --set args={--kubelet-insecure-tls} metrics-server metrics-server/metrics-server --namespace kube-system +``` + +b) vertical pod autoscaler + +``` +git clone https://github.com/kubernetes/autoscaler +./autoscaler/vertical-pod-autoscaler/hack/vpa-up.sh +``` + +#### 6) Deploy PCM helm chart + +``` +# a) Deploy to current namespace with defaults +helm install pcm . + +# b) Alternatively deploy with NFD and/or with Prometheus enabled +helm install pcm . --set podMonitor=true +helm install pcm . --set nfd=true + +# c) Alternatively deploy into own "pcm" namespace +helm install pcm . 
--namespace pcm +``` + +#### 7) Check metrics are exported + +Run proxy in background: +``` +kubectl proxy & +``` + +Access PCM metrics directly: + +```sh +kubectl get daemonset pcm +kubectl get pods +podname=`kubectl get pod -l app.kubernetes.io/component=pcm-sensor-server -ojsonpath='{.items[0].metadata.name}'` + +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/metrics +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/metrics | grep L3_Cache_Misses # source: core +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/metrics | grep DRAM_Writes # source: uncore +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/metrics | grep Local_Memory_Bandwidth{socket="1",aggregate="socket",source="core"} # source: RDT +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/metrics | grep DRAM_Joules_Consumed # source: energy +``` + +... or through Prometheus UI/prom tool (requires prometheus operator to be deployed and helm install with with `--set podMonitor=true`): +``` +http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy/graph +promtool query range --step 1m http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'rate(DRAM_Writes{aggregate="system"}[5m])/1e9' +promtool query instant http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'avg by(__name__) ({job="pcm"})' +``` + +... 
or through Grafana with generated dashboard: + +``` + + +# 1) Download dashboard +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/dashboard/prometheus -o pcm-dashboard.json + +# change default (too small) interval (from 4s to 2m, following Prometheus best practices of the rate window being four times larger than the 30s scrape interval) +# References: +# https://grafana.com/blog/2020/09/28/new-in-grafana-7.2-__rate_interval-for-prometheus-rate-queries-that-just-work/ +# ($__rate_interval is 4 x scrape interval defined in datasource provisioned by prometheus operator, scrape interval is based on Prometheus object which defaults to 30s) +# - https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml#L1069 +# - https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml#L3381 +sed -i 's/4s/$__rate_interval/g' pcm-dashboard.json + +# 2) port forward with kubectl (--address=0.0.0.0) +kubectl port-forward -n default service/prometheus-grafana 8002:80 + +# 3) User: admin/prom-operator +# or get password kubectl get secret --namespace default prometheus-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo +http://127.0.0.1:8002 + +# 4) Go to Dashboards/New/Import and upload: + +pcm-dashboard.json + +``` + +### Deploy alternative options + +#### Direct (msr access) as privileged container +``` +helm install pcm . -f values-direct-privileged.yaml +``` + +#### Homogeneous bare metal instances cluster (full set of metrics) + +``` +helm install pcm . -f values-metal-nfd.yaml +``` + +#### Homogeneous VM instances cluster (limited set of metrics core) + +``` +helm install pcm . -f values-vm.yaml +``` + +#### Heterogeneous (mixed VM/metal instances) cluster + +values-metal-nfd.yaml requires node-feature-discovery to be preinstalled +``` +helm install pcm-vm . -f values-vm.yaml +helm install pcm-metal . 
-f values-metal-nfd.yaml +``` + +#### Direct method as non-privileged container (not recommended) + +**Note** PCM requires access to /dev/cpu device in read-write mode (MSR access) but it is no possible currently to mount devices in Kubernetes pods/containers in vanilla Kubernetes for unprivileged containers. Please find more about this limitation https://github.com/kubernetes/kubernetes/issues/5607. + +To expose necessary devices to pcm-sensor-server, one can use: + +a) Kubernetes device plugin (using Kubernetes [CDI](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/) interface), +b) containerd plugin (using [NRI](https://github.com/containerd/nri/) interface), + +Examples can be find [here](docs/direct-unprivileged-deployment.md). + +#### Development (with local images) and testing + +1) Setup kind with registry following this instruction: https://kind.sigs.k8s.io/docs/user/local-registry/ +``` +wget https://kind.sigs.k8s.io/examples/kind-with-registry.sh +bash kind-with-registry.sh +``` + +2) Build docker image and upload to local registry + +``` +# optionally create buildx based builder +mkdir ~/.docker/cli-plugins +curl -sL https://github.com/docker/buildx/releases/download/v0.14.0/buildx-v0.14.0.linux-amd64 -o ~/.docker/cli-plugins/docker-buildx +chmod +x ~/.docker/cli-plugins/docker-buildx +docker buildx create --driver docker-container --name mydocker --use --bootstrap + +# Build production image from **project root directory**: +docker build . -t localhost:5001/pcm-local +docker push localhost:5001/pcm-local + +# Build/push **debug** image with single line +# Debug Dockerfile contains source code of pcm and some debugging utils (like gdb,strace for further analysis) +# Run from deployment/pcm/ directory: +(cd ../.. ; docker build . 
-f Dockerfile.debug -t localhost:5001/pcm-local && docker push localhost:5001/pcm-local) +``` + +3) When deploying to kind cluster pcm use values to switch to local pcm-local image +``` +helm install pcm . -f values-local-image.yaml +``` + +4) Replace pcm-sensor-server with pcm or sleep to be able to run `gdb` or `strace` for example +``` +helm upgrade --install pcm . --set debugPcm=true +helm upgrade --install pcm . --set debugSleep=true +``` + +**TODO:** consider debug options to be removed before release for security reasons + +5) Check logs or interact with container directly: +``` +# exec into pcm container +kubectl exec -ti ds/pcm -- bash +# or check logs +kubectl logs ds/pcm +``` + +6) Helm testing + +``` +helm test pcm + +# in case of failing, see the logs of test connection pod +# NOTE: filter is used to ignore service (helm limitation, which tries to download logs from service), so it assumes service exists, because previous run failed +helm test pcm --logs --filter name=pcm-test-connection + +# or run test-connection-pod manually +kubectl run -ti --rm --image busybox pcm-test-connection-manual -- sh +kubectl run -ti --rm --image busybox pcm-test-connection-manual -- ping pcm-test-connection.default.svc.cluster.local -t 1 -W 1 -w 1 -c 1 +kubectl run -ti --rm --image busybox pcm-test-connection-manual -- wget -S -T 15 pcm-test-connection.default.svc.cluster.local:9739/metrics +``` + +### Metric collection methods (capabilities vs requirements) + + + +| Metrics | Available on Hardware | Available through interface | Available through method | +| --------------------- | ----------------------------- | ---------------------------- | ------------------------ | +| core | bare-metal, VM (any) | msr or perf | any | +| uncore (UPI) | bare-metal, VM (all sockets) | msr or perf | any | +| RDT (MBW,L3OCCUP) | bare-metal, VM (all sockets) | msr or resctrl | any | +| energy, temp | bare-metal (only) | msr | direct | +| perf-topdown | | perf only | indirect | + + +| 
Interface | Requirements | Controlled by (env/helm value) | default helm | Used by source code | Notes | +|---------------|------------------------------------------------------------|---------------------------------|-----------------------|----------------------------------------------------------|-----------------------------------------------------| +| perf | sys_perf_open() perf_paranoid<=0/privileged/CAP_ADMIN | PCM_NO_PERF | use perf | programPerfEvent(), PerfVirtualControlRegister() | | +| perf-uncore | sys_perf_open() perf_paranoid<=0/privileged/CAP_ADMIN | PCM_USE_UNCORE_PERF | use perf for uncore | programPerfEvent(), PerfVirtualControlRegister() | | +| perf-topdown | /sys/bus/event_source/devices/cpu/events | sysMount | yes | cpucounters.cpp:perfSupportsTopDown() | TODO: conflicts with sys/fs/resctrl | +| RDT | uses "msr" or "resctrl" interface | PCM_NO_RDT | yes | cpucounters.cpp:isRDTDisabled()/QOSMetricAvailable() | | +| resctrl | RW: /sys/fs/resctrl | PCM_USE_RESCTRL | yes | resctrl.cpp | resctrlMount | +| watchdog | RO/RW: /proc/sys/kernel/nmi_watchdog | PCM_KEEP_NMI_WATCHDOG | yes (tries to disable)| src/cpucounters.cpp:disableNMIWatchdog() | | +| msr | RW: /dev/cpu/X/msr + privileged or CAP_ADMIN/CAP_RAWIO | PCM_NO_MSR | msr is disabled | msr.cpp:MsrHandle() | privileged or some method to access /dev/cpu | +| | RW: /dev/mem | ? 
| msr is disabled | cpucounters.cpp:initUncoreObjects, pci.cpp:PCIHandleM() | privileged or some method to access /dev/cpu | +| | RO/RW: /sys/module/msr/parameters | PCM_NO_MSR | msr is disabled | msr.cpp:MsrHandle() | sysMount | +| | RW: /proc/bus/pci | PCM_USE_UNCORE_PERF | msr is disabled | pci.cpp:PCIHandle() | pciMount | +| | RO: /sys/firmware/acpi/tables/MCFG | PCM_USE_UNCORE_PERF | msr is disabled | pci.cpp:PciHandle::openMcfgTable() | mcfgMount | +| | energy | | | cpucounters.cpp initEnergyMonitoring() | | + + +### E2E tests + +Following end to end tests based on kind enviornment are provided by make targets: + +- `e2e-default` - test PCM with default configuration (indirect) and checks connection by calling `helm test` +- `e2e-default-local-image` - same as above but build and deploys PCM with local image +- `e2e-prometheus` - test PCM chart with deployed PodMonitor with Prometheus stack and queries Prometheus for collected data, +- `e2e-vpa` - deploy PCM with VerticalPodAutoscaler (requires metrics-service to be deployed alongside) +- `e2e-metal-nfd` - test PCM chart on metal scheduled by features exposed by node-feature-discovery (uses: values-metal-nfd.yaml), diff --git a/deployment/pcm/docs/direct-unprivileged-deployment.md b/deployment/pcm/docs/direct-unprivileged-deployment.md new file mode 100644 index 00000000..fd760a17 --- /dev/null +++ b/deployment/pcm/docs/direct-unprivileged-deployment.md @@ -0,0 +1,67 @@ +-------------------------------------------------------------------------------- +Examples of deploying with direct MSR access as non-privileged container +-------------------------------------------------------------------------------- + +#### Direct method as non-privileged container (not recommended) + +##### a) Device injection using 3rd party device-plugin + +TO run PCM with as non privileged pod, we can third party devices plugins e.g.: + +- https://github.com/smarter-project/smarter-device-manager +- 
https://github.com/squat/generic-device-plugin +- https://github.com/everpeace/k8s-host-device-plugin + +**Warning** These plugins were NOT audited for security concerns, **use them at your own risk**. + +Below is an example of how to pass /dev/cpu and /dev/mem using smarter-device-manager in a kind based Kubernetes test cluster. + +``` +# Label node to deploy device plugin on that node +kubectl label node kind-control-plane smarter-device-manager=enabled + +# Install "smarter-device-manager" device plugin with only /dev/cpu and /dev/mem devices enabled: +git clone https://github.com/smarter-project/smarter-device-manager +helm install smarter-device-plugin --create-namespace --namespace smarter-device-plugin smarter-device-manager/charts/smarter-device-manager --set 'config[0].devicematch=^cpu$' --set 'config[0].nummaxdevices=1' --set 'config[1].devicematch=^mem$' --set 'config[1].nummaxdevices=1' + +# Check that cpu and mem devices are available - should return "1" +kubectl get node kind-control-plane -o json | jq .status.capacity + +# Install pcm helm chart in unprivileged mode with extraResources for cpu and memory devices. +helm install pcm . -f docs/direct-unprivileged-examples/values-direct-unprivileged.yaml -f docs/direct-unprivileged-examples/values-smarter-devices-cpu-mem.yaml +``` + +##### b) Device injection using NRI plugin device-injection + +**TODO**: **Warning** This is a work in progress, because all /dev/cpu/XX/msr devices currently have to be specified manually, which is impractical in production (TO BE MOVED TO EXTERNAL FILE). 
+ +``` +git clone https://github.com/containerd/nri/ +(cd nri/plugins/device-injector/ && go build ) +docker cp kind-control-plane:/etc/containerd/config.toml config.toml + +cat >>config.toml < 0.09090909090909094 @[1707901856.957] +Clock_Unhalted_Ref => 1010026077.3913049 @[1707901856.957] +Clock_Unhalted_Thread => 1295730425.8695648 @[1707901856.957] +DRAM_Joules_Consumed => 0 @[1707901856.957] +DRAM_Reads => 3600814506.6666665 @[1707901856.957] +DRAM_Writes => 1974366592 @[1707901856.957] +Embedded_DRAM_Reads => 0 @[1707901856.957] +Embedded_DRAM_Writes => 0 @[1707901856.957] +Incoming_Data_Traffic_On_Link_0 => 689786624 @[1707901856.957] +Incoming_Data_Traffic_On_Link_1 => 689454432 @[1707901856.957] +Incoming_Data_Traffic_On_Link_2 => 0 @[1707901856.957] +Instructions_Retired_Any => 749013885.5739133 @[1707901856.957] +Invariant_TSC => 432975372048881700 @[1707901856.957] +L2_Cache_Hits => 3531524.973913045 @[1707901856.957] +L2_Cache_Misses => 2334387.130434784 @[1707901856.957] +L3_Cache_Hits => 1325323.1739130428 @[1707901856.957] +L3_Cache_Misses => 627863.4000000003 @[1707901856.957] +L3_Cache_Occupancy => 0 @[1707901856.957] +Local_Memory_Bandwidth => 0 @[1707901856.957] +Measurement_Interval_in_us => 14507400443881 @[1707901856.957] +Memory_Controller_IO_Requests => 0 @[1707901856.957] +Number_of_sockets => 2 @[1707901856.957] +OS_ID => 55.499999999999986 @[1707901856.957] +Outgoing_Data_And_Non_Data_Traffic_On_Link_0 => 1843333122.5 @[1707901856.957] +Outgoing_Data_And_Non_Data_Traffic_On_Link_1 => 1849219231.5 @[1707901856.957] +Outgoing_Data_And_Non_Data_Traffic_On_Link_2 => 0 @[1707901856.957] +Package_Joules_Consumed => 0 @[1707901856.957] +Persistent_Memory_Reads => 0 @[1707901856.957] +Persistent_Memory_Writes => 0 @[1707901856.957] +RawCStateResidency => 89486131.66409859 @[1707901856.957] +Remote_Memory_Bandwidth => 0 @[1707901856.957] +SMI_Count => 0 @[1707901856.957] +Thermal_Headroom => -2147483648 @[1707901856.957] 
+Utilization_Incoming_Data_Traffic_On_Link_0 => 0 @[1707901856.957] +Utilization_Incoming_Data_Traffic_On_Link_1 => 0 @[1707901856.957] +Utilization_Incoming_Data_Traffic_On_Link_2 => 0 @[1707901856.957] +Utilization_Outgoing_Data_And_Non_Data_Traffic_On_Link_0 => 0 @[1707901856.957] +Utilization_Outgoing_Data_And_Non_Data_Traffic_On_Link_1 => 0 @[1707901856.957] +Utilization_Outgoing_Data_And_Non_Data_Traffic_On_Link_2 => 0 @[1707901856.957] +``` diff --git a/deployment/pcm/templates/_helpers.tpl b/deployment/pcm/templates/_helpers.tpl new file mode 100644 index 00000000..fffa7025 --- /dev/null +++ b/deployment/pcm/templates/_helpers.tpl @@ -0,0 +1,79 @@ +{{/* Expand the name of the chart. */}} +{{- define "pcm.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. */}} +{{- define "pcm.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* Create chart name and version as used by the chart label. */}} +{{- define "pcm.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* Selector labels */}} +{{- define "pcm.selectorLabels" -}} +app.kubernetes.io/name: {{ include "pcm.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/component: pcm-sensor-server +{{- end }} + +{{/* Common labels */}} +{{- define "pcm.labels" -}} +helm.sh/chart: {{ include "pcm.chart" . 
}} +{{ include "pcm.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* SecurityContext privileged or capabilties */}} +{{- define "pcm.securityContext" -}} +securityContext: +{{- if .Values.privileged }} + privileged: true +{{- else -}} + {{/* TODO? + readOnlyRootFilesystem: false + runAsUser: 0 + runAsGroup: 0 + ## below two doesnt work on container level! + fsGroup: 0 + supplementalGroups: [0] + seccompProfile: + #type: RuntimeDefault + type: Unconfined + */}} + capabilities: + add: + - {{ if .Values.cap_perfmon }}PERFMON{{ else }}SYS_ADMIN{{ end }} + - SYS_RAWIO +{{- end }} +{{- end }} + + +{{/* Probes: liveness and readiness probe */}} +{{- define "pcm.probe" -}} +failureThreshold: 3 +httpGet: + path: / + port: 9738 + scheme: HTTP +periodSeconds: 10 +successThreshold: 1 +timeoutSeconds: 1 +{{- end }} diff --git a/deployment/pcm/templates/_tests/test-connection.yaml b/deployment/pcm/templates/_tests/test-connection.yaml new file mode 100644 index 00000000..3626676c --- /dev/null +++ b/deployment/pcm/templates/_tests/test-connection.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "pcm.fullname" . }}-test-connection" + #name: pcm + # labels: + #{{/* {{- include "pcm.labels" . | nindent 4 }} */}} + annotations: + "helm.sh/hook": test + "helm.sh/hook-weight": "2" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + containers: + - name: wget + image: busybox + command: + - 'sh' + - '-c' + - 'sleep 15; ping {{ include "pcm.fullname" . }}-test-connection.{{ .Release.Namespace }}.svc.cluster.local -t 1 -W 1 -w 1 -c 1 ; wget -T 15 -S {{ include "pcm.fullname" . 
}}-test-connection.{{ .Release.Namespace }}.svc.cluster.local:9739/metrics -O - | grep Measurement_Interval_in_us' + restartPolicy: Never diff --git a/deployment/pcm/templates/_tests/test-service.yaml b/deployment/pcm/templates/_tests/test-service.yaml new file mode 100644 index 00000000..8416c3f5 --- /dev/null +++ b/deployment/pcm/templates/_tests/test-service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: "{{ include "pcm.fullname" . }}-test-connection" + labels: + {{- include "pcm.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test + "helm.sh/hook-weight": "1" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + type: NodePort + ports: + - port: 9739 + targetPort: pcm-metrics + protocol: TCP + name: pcm-metrics + selector: + {{- include "pcm.selectorLabels" . | nindent 4 }} diff --git a/deployment/pcm/templates/daemonset.yaml b/deployment/pcm/templates/daemonset.yaml new file mode 100644 index 00000000..6625fd15 --- /dev/null +++ b/deployment/pcm/templates/daemonset.yaml @@ -0,0 +1,201 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "pcm.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "pcm.labels" . | nindent 4 }} +spec: + selector: + matchLabels: + {{- include "pcm.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "pcm.labels" . | nindent 8 }} + annotations: + {{- with .Values.podAnnotations }}{{- toYaml . | nindent 8 }}{{- end }} + {{- if .Values.nriBalloonsPolicyIntegration }} + cpu.preserve.resource-policy.nri.io: "true" + {{- end }} + spec: + nodeSelector: + {{- with .Values.nodeSelector -}}{{- toYaml . 
| nindent 8 -}}{{- end -}} + {{- if .Values.nfd }} + feature.node.kubernetes.io/cpu-model.vendor_id: Intel + {{- if .Values.nfdRDTAffinity }} + feature.node.kubernetes.io/cpu-rdt.RDTCMT: "true" + feature.node.kubernetes.io/cpu-rdt.RDTL3CA: "true" + feature.node.kubernetes.io/cpu-rdt.RDTMBA: "true" + feature.node.kubernetes.io/cpu-rdt.RDTMBM: "true" + feature.node.kubernetes.io/cpu-rdt.RDTMON: "true" + {{- end }} + {{- if .Values.nfdBaremetalAffinity}} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR" + operator: DoesNotExist + {{- end }} + {{- end }} {{/* if nfd */}} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end -}} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + terminationGracePeriodSeconds: 0 + containers: + - name: pcm + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- include "pcm.securityContext" . 
| nindent 8 }} + {{- if .Values.silent }} + command: + - "/usr/local/bin/pcm-sensor-server" + - "-p" + - "9738" + - "-r" + - "-silent" + {{- end -}} + {{- if .Values.debugSleep }} + command: + - /usr/bin/sleep + - inf + {{- end -}} + {{- if .Values.debugPcm }} + command: + - /bin/bash + - -c + - "/usr/local/bin/pcm 2 -r -nc -nsys{{ if .Values.silent }} -silent{{ end }}" + {{- end -}} + {{- if .Values.resctrlInternalMount }} + # Ugly hack to mount resctrl inside only for baremetal when we want use resctrl abstraction and is not mounted on HOST: TBC conflicts with + command: + - /bin/bash + - -c + - "dnf install -q -y util-linux-core; mount -t resctrl resctrl /sys/fs/resctrl; /usr/local/bin/pcm-sensor-server -p 9738 -r" + {{- end -}} + {{/* ALREADY DONE by securityContext on pod level + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 12 }} + */}} + resources: + requests: + {{ with .Values.cpuRequest }}cpu: {{.}}{{ end }} + {{ with .Values.memoryRequest }}memory: {{.}}{{ end }} + {{- with .Values.extraResources }} {{- toYaml .requests | nindent 12 }} {{- end }} + limits: + {{ with .Values.cpuLimit }}cpu: {{.}}{{ end }} + {{ with .Values.memoryLimit }}memory: {{.}}{{ end }} + {{- with .Values.extraResources }} {{- toYaml .limits | nindent 12 }} {{- end }} + env: + - name: PCM_NO_MSR + value: {{ .Values.PCM_NO_MSR | quote }} + - name: PCM_NO_PERF + value: {{ .Values.PCM_NO_PERF | quote }} + - name: PCM_USE_UNCORE_PERF + value: {{ .Values.PCM_USE_UNCORE_PERF | quote }} + - name: PCM_NO_RDT + value: {{ .Values.PCM_NO_RDT | quote }} + - name: PCM_USE_RESCTRL + value: {{ .Values.PCM_USE_RESCTRL | quote }} + - name: PCM_IGNORE_ARCH_PERFMON + value: {{ .Values.PCM_IGNORE_ARCH_PERFMON | quote }} + - name: PCM_KEEP_NMI_WATCHDOG + value: {{ .Values.PCM_KEEP_NMI_WATCHDOG | quote }} + - name: PCM_NO_AWS_WORKAROUND + value: {{ .Values.PCM_NO_AWS_WORKAROUND | quote }} + - name: PCM_NO_UNCORE_PMU_DISCOVERY + value: {{ .Values.PCM_NO_UNCORE_PMU_DISCOVERY | quote }} 
+ - name: PCM_PRINT_UNCORE_PMU_DISCOVERY + value: {{ .Values.PCM_PRINT_UNCORE_PMU_DISCOVERY | quote }} + - name: PCM_PRINT_TOPOLOGY + value: {{ .Values.PCM_PRINT_TOPOLOGY | quote }} + - name: PCM_NO_MAIN_EXCEPTION_HANDLER + value: {{ .Values.PCM_NO_MAIN_EXCEPTION_HANDLER | quote }} + {{- with .Values.probes }} + livenessProbe: + {{- include "pcm.probe" . | nindent 12 }} + readinessProbe: + {{- include "pcm.probe" . | nindent 12 }} + {{- end }} + {{- with .Values.hostPort }} + ports: + - containerPort: 9738 + hostPort: {{ . }} + name: pcm-metrics + protocol: TCP + {{- end }} + volumeMounts: + # {{- if .Values.privileged }} + # - mountPath: /pcm/dev/cpu + # name: dev-cpu + # readOnly: false + # - mountPath: /pcm/dev/mem + # name: dev-mem + # readOnly: false + # {{- end }} + {{- if .Values.pciMount }} + - mountPath: /pcm/proc/bus/pci + name: proc-pci + {{- end }} + {{- if .Values.sysMount }} + - mountPath: /pcm/sys + name: sysfs + readOnly: true + {{- end }} + {{- if .Values.nmiWatchdogMount }} + - mountPath: /pcm/proc/sys/kernel/nmi_watchdog + name: nmi-watchdog + readOnly: true # RW? 
# TODO + {{- end }} + {{- if .Values.resctrlMount }} + - mountPath: /sys/fs/resctrl + name: sysfs-resctrl + {{- end }} + # TODO: to be removed, already handled by /sysMount + # {{- if .Values.mcfgMount }} + # - mountPath: /pcm/sys/firmware/acpi/tables/MCFG + # name: sys-acpi + # readOnly: true + # {{- end }} + volumes: + # {{- if .Values.privileged }} + # - name: dev-cpu + # hostPath: + # path: /dev/cpu + # - name: dev-mem + # hostPath: + # path: /dev/mem + # {{- end}} + {{- if .Values.sysMount }} + - name: sysfs + hostPath: + path: /sys + {{- end}} + {{- if .Values.pciMount }} + - name: proc-pci + hostPath: + path: /proc/bus/pci + {{- end}} + {{- if .Values.nmiWatchdogMount }} + - name: nmi-watchdog + hostPath: + path: /proc/sys/kernel/nmi_watchdog + {{- end }} + # TODO: to be removed, already handled by /sysMount + # {{- if .Values.mcfgMount }} + # - name: sys-acpi + # hostPath: + # path: /sys/firmware/acpi/tables/MCFG + # {{- end }} + {{- if .Values.resctrlMount }} + - name: sysfs-resctrl + hostPath: + path: /sys/fs/resctrl + {{- end }} diff --git a/deployment/pcm/templates/podmonitor.yaml b/deployment/pcm/templates/podmonitor.yaml new file mode 100644 index 00000000..b9477e95 --- /dev/null +++ b/deployment/pcm/templates/podmonitor.yaml @@ -0,0 +1,41 @@ +{{- if .Values.podMonitor }} +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ include "pcm.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "pcm.labels" . | nindent 4 }} + app.kubernetes.io/component: metrics + jobLabel: pcm + {{- with .Values.podMonitorLabels }} + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + attachMetadata: + node: true + jobLabel: jobLabel + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + podMetricsEndpoints: + # requires hostPort to be set {{ required "A valid .Values.hostPort is required with PodMonitor enabled " .Values.hostPort }} + - enableHttp2: false + filterRunning: true + followRedirects: false + honorLabels: true + honorTimestamps: true + path: /metrics + port: pcm-metrics + interval: {{ .Values.podMonitorInterval | quote }} + relabelings: + - sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: nodename + scheme: http + selector: + matchLabels: + app.kubernetes.io/component: pcm-sensor-server + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/name: pcm +{{- end }} diff --git a/deployment/pcm/templates/verticalpodautoscaler.yaml b/deployment/pcm/templates/verticalpodautoscaler.yaml new file mode 100644 index 00000000..9d0941d9 --- /dev/null +++ b/deployment/pcm/templates/verticalpodautoscaler.yaml @@ -0,0 +1,40 @@ +{{- if and (.Capabilities.APIVersions.Has "autoscaling.k8s.io/v1") (.Values.verticalPodAutoscaler.enabled) }} +apiVersion: autoscaling.k8s.io/v1 +kind: VerticalPodAutoscaler +metadata: + name: {{ include "pcm.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "pcm.labels" . | nindent 4 }} +spec: + {{- with .Values.verticalPodAutoscaler.recommenders }} + recommenders: + {{- toYaml . | nindent 4 }} + {{- end }} + resourcePolicy: + containerPolicies: + - containerName: pcm + {{- with .Values.verticalPodAutoscaler.controlledResources }} + controlledResources: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.verticalPodAutoscaler.controlledValues }} + controlledValues: {{ . }} + {{- end }} + {{- with .Values.verticalPodAutoscaler.maxAllowed }} + maxAllowed: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.verticalPodAutoscaler.minAllowed }} + minAllowed: + {{- toYaml . 
| nindent 8 }} + {{- end }} + targetRef: + apiVersion: apps/v1 + kind: DaemonSet + name: {{ include "pcm.fullname" . }} + {{- with .Values.verticalPodAutoscaler.updatePolicy }} + updatePolicy: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/deployment/pcm/values-direct-privileged.yaml b/deployment/pcm/values-direct-privileged.yaml new file mode 100644 index 00000000..531224c6 --- /dev/null +++ b/deployment/pcm/values-direct-privileged.yaml @@ -0,0 +1,15 @@ +#### Tuning for "direct" privileged access +privileged: true + +# Switch PCM to use msr access always +PCM_NO_MSR: 0 # use MSR +PCM_NO_PERF: 1 # do not use Linux perf +PCM_USE_UNCORE_PERF: 0 # also use MSR for uncore +PCM_NO_RDT: 0 # Enable RDT metrics ... +PCM_USE_RESCTRL: 0 # but using MSR (no resctrl filesystem) + +# with a privileged container additional mounts aren't required +resctrlMount: false # with MSR resctrl mount is not needed +resctrlInsideMount: false +sysMount: false +pciMount: false diff --git a/deployment/pcm/values-local-image.yaml b/deployment/pcm/values-local-image.yaml new file mode 100644 index 00000000..7d1c336d --- /dev/null +++ b/deployment/pcm/values-local-image.yaml @@ -0,0 +1,4 @@ +image: + repository: localhost:5001/pcm-local + tag: "latest" + pullPolicy: Always diff --git a/deployment/pcm/values-metal-nfd.yaml b/deployment/pcm/values-metal-nfd.yaml new file mode 100644 index 00000000..80a85bc5 --- /dev/null +++ b/deployment/pcm/values-metal-nfd.yaml @@ -0,0 +1,7 @@ +#### ================ Tuning for bare-metal instances ================ +# with node-feature-discovery node affinity for non hypervisor and RDT +nmiWatchdogMount: false +PCM_NO_AWS_WORKAROUND: 1 +PCM_KEEP_NMI_WATCHDOG: 0 +nfd: true +nfdBaremetalAffinity: true diff --git a/deployment/pcm/values-vm.yaml b/deployment/pcm/values-vm.yaml new file mode 100644 index 00000000..e9a43327 --- /dev/null +++ b/deployment/pcm/values-vm.yaml @@ -0,0 +1,6 @@ +#### ================ Tuning for VM ================ 
+nmiWatchdogMount: true + +# Disable RDT because it is not available for VM instances +PCM_NO_RDT: 1 +resctrlMount: false diff --git a/deployment/pcm/values.yaml b/deployment/pcm/values.yaml new file mode 100644 index 00000000..1bbb9607 --- /dev/null +++ b/deployment/pcm/values.yaml @@ -0,0 +1,167 @@ +### -------------- Naming ------------------- +# Used in: +# - common label: app.kubernetes.io/name otherwise "Chart name" +# - also in selectorLabels together with release.name +# defaults to "Chart.name" +nameOverride: "" +# Used as daemonset name (usually based on truncated "name + release name") +fullnameOverride: "" + +### -------------- Image options ------------ +image: + repository: ghcr.io/intel/pcm + pullPolicy: IfNotPresent + tag: "latest" # uses .Chart.AppVersion if empty +imagePullSecrets: {} + +### -------------- Security ------------------ +# Configures SecurityContext to not privileged (by default) so SYS_ADMIN/SYS_RAWIO capabilities are required for running pod +privileged: false + +# Use new kernel 5.8+ PERFMON (least privileged) instead of generic SYS_ADMIN capability +# !Warning requires kernel 5.8+ +# more info here: https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html#perf-events-access-control +cap_perfmon: true + +# Run pcm in silent mode (additional -silent argument to pcm-sensor-server binary) +# Removes some of the debug output (like warnings about the inability to open some /sys... /proc... 
files) +silent: false + +### -------------- Required OS affinity ------- +# Should only run on Linux +nodeSelector: + kubernetes.io/os: linux + +### -------------- Probes --------------------- +probes: false + +### ================ Metrics configuration ====================== + +### -------------- Metrics: Uncore ------------ +# Mounts section +# NOTE: only required for direct mode +# required for uncore metrics discovery; works only on bare metal, not available for VM +sysMount: false # mounts host /sys into container /pcm/sys/ +pciMount: false # mounts host /proc/bus/pci into container /pcm/proc/bus/pci/ + +# NOTE: is this only required for direct unprivileged mode? +# TODO: to be removed? (already covered by sysMount) - yes or no +mcfgMount: false # mounts host: /sys/firmware/acpi/tables/MCFG -> /pcm/sys/firmware/acpi/tables/MCFG + +### linux Perf (indirect) vs msr(direct) +# Let's try "indirect" as default +PCM_NO_MSR: 1 # do not use MSR +PCM_NO_PERF: 0 # use Linux Perf over MSR for core metrics +PCM_USE_UNCORE_PERF: 1 # use Linux Perf instead of MSR for uncore metrics (collection+detection) + +### -------------- Metrics: RDT --------------- +### RDT rdt/resctrl: +PCM_NO_RDT: 0 # 0 - try to collect RDT data, enables local/remote memory bandwidth + llc occupancy +PCM_USE_RESCTRL: 1 # use the resctrl filesystem instead of MSR access (more reliable) +# required for indirect RDT access; available only on bare metal, not on VMs +# do not mount by default; RDT can also be accessed through direct MSR programming +resctrlMount: true # mount from external host +resctrlInsideMount: false # TODO: mount inside with extra call to mount, requires image with mount installed - doesn't require (NOTE: templates/daemonset.yaml reads .Values.resctrlInternalMount, so this key is currently not consumed) + +### -------------- Other (NMI handling and/or on VM/AWS) +PCM_IGNORE_ARCH_PERFMON: 0 # After VM is detected through CPUID (hypervisor flag) - check arch_perfmon flag to be also enabled - fail if not available (0 - do check, 1 - disable check) +# 0: Disabling NMI watchdog 
since it consumes one hw-PMU counter, requires nmiWatchdogMount to be true +# 1: don't disable NMI watchdog (reducing the core metrics set) - preferred for production usage! +# but even with 0 automatic AWS workaround applies! +PCM_KEEP_NMI_WATCHDOG: 0 +# workaround: after VM is detected: "INFO: Reducing the number of programmable counters to 3 to workaround the fixed cycle counter virtualization issue on AWS.\n";) +# 1: disables workaround and tries to use four programmable counters (without the workaround pcm-sensor-server will hang on a VM) +# Please do not disable (value=1) on VMs +PCM_NO_AWS_WORKAROUND: 0 + +# mounting watchdog is recommended when PCM_KEEP_NMI_WATCHDOG=0 or we expect AWS workaround to be applied +nmiWatchdogMount: true + +### -------------- Other (Debugging options for uncore pmu discovery) +PCM_NO_UNCORE_PMU_DISCOVERY: 0 # skip 1: this is not required for direct privileged access and with 0 ends with WARNING enumeration failed +PCM_PRINT_UNCORE_PMU_DISCOVERY: 1 # show: discovered pmu +PCM_PRINT_TOPOLOGY: 0 # show individual CPU topology for each core (plenty of lines) +PCM_NO_MAIN_EXCEPTION_HANDLER: 0 # show full call stack of error + +### =============================== Optional POD fields not related to PCM =============================== +# Pod level +podAnnotations: {} +podLabels: {} +# Container level +tolerations: [] +# Resources cpu/mem +cpuLimit: 100m +cpuRequest: 100m +memoryLimit: 512Mi +memoryRequest: 256Mi +# requests, limits level need to be specified here +extraResources: {} + +### =============================== Integrations with other projects ==================================== +# +### -------------- Prometheus operator -------------------- +# Expose run containerPort "pcm-sensor-server -p 9738" as hostPort, can be empty to disable hostPort +hostPort: 9738 +# Deploy Prometheus Operator PodMonitor (requires hostPort to be not empty) +podMonitor: false +# Extra PodMonitor labels to let Prometheus operator filter based on that +# e.g. 
default "kube-prometheus-stack" helm chart requires additional release:"{name of chart release}" label in podMonitor to be considered +# here is an example of how to check extra labels required to be added to PodMonitor +# 1) kubectl get prometheus -o jsonpath='{.items[].spec.podMonitorSelector.matchLabels}' # e.g. release: prometheus +# 2) helm install pcm . --set podMonitor=true --set podMonitorLabels.release=prometheus +podMonitorLabels: {} +# Default interval for Prometheus scraping configuration +podMonitorInterval: 30s + + +### -------------- NRI balloons policy plugin ------------- +# PCM deployment to be integrated with the NRI balloons resource policy +# if true, will add special annotation to allow the pcm pod to use all cores, regardless of NRI balloons policy rules. +nriBalloonsPolicyIntegration: false + +### ------------- node-feature-discovery ----------------- +# when enabled specific set of labels will be used as node selector (Intel vendor, RDT availability, baremetal) +nfd: false +# if enabled daemonset nodeAffinity will require node without feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR flag (requires nfd=true) +nfdBaremetalAffinity: false +# if enabled, following RDT labels will be required for scheduling (requires nfd=true) +# TODO: those labels are no longer available with default node-feature-discovery deployment +# feature.node.kubernetes.io/cpu-rdt.RDTCMT=true +# feature.node.kubernetes.io/cpu-rdt.RDTL3CA=true +# feature.node.kubernetes.io/cpu-rdt.RDTMBA=true +# feature.node.kubernetes.io/cpu-rdt.RDTMBM=true +# feature.node.kubernetes.io/cpu-rdt.RDTMON=true +nfdRDTAffinity: false + + +### -------------- verticalPodAutoscaler ------------------ +# Enable vertical pod autoscaler support for pcm-sensor-server +verticalPodAutoscaler: + enabled: false + + # Recommender responsible for generating recommendation for the object. 
+ # List should be empty (then the default recommender will generate the recommendation) + # or contain exactly one recommender. + # recommenders: + # - name: custom-recommender-performance + + # List of resources that the vertical pod autoscaler can control. Defaults to cpu and memory + controlledResources: [] + # Specifies which resource values should be controlled: RequestsOnly or RequestsAndLimits. + # controlledValues: RequestsAndLimits + + # Define the max allowed resources for the pod + maxAllowed: {} + # cpu: 200m + # memory: 100Mi + # Define the min allowed resources for the pod + minAllowed: {} + # cpu: 200m + # memory: 100Mi + + # updatePolicy: + # Specifies minimal number of replicas which need to be alive for VPA Updater to attempt pod eviction + # minReplicas: 1 + # Specifies whether recommended updates are applied when a Pod is started and whether recommended updates + # are applied during the life of a Pod. Possible values are "Off", "Initial", "Recreate", and "Auto". + # updateMode: Auto diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0af9ad1c..a211d7cd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,7 +3,7 @@ # All pcm-* executables -set(PROJECT_NAMES pcm pcm-numa pcm-latency pcm-power pcm-msr pcm-memory pcm-tsx pcm-pcie pcm-core pcm-iio pcm-lspci pcm-pcicfg pcm-mmio pcm-tpmi pcm-raw pcm-accel) +set(PROJECT_NAMES pcm pcm-numa pcm-latency pcm-power pcm-msr pcm-memory pcm-tsx pcm-pcie pcm-core pcm-iio pcm-lspci pcm-pcicfg pcm-mmio pcm-tpmi pcm-raw pcm-accel dashboardtest) file(GLOB COMMON_SOURCES pcm-accel-common.cpp msr.cpp cpucounters.cpp pci.cpp mmio.cpp tpmi.cpp pmt.cpp bw.cpp utils.cpp topology.cpp debug.cpp threadpool.cpp uncore_pmu_discovery.cpp) diff --git a/src/cpucounters.cpp b/src/cpucounters.cpp index ef6bdc89..ec16202d 100644 --- a/src/cpucounters.cpp +++ b/src/cpucounters.cpp @@ -552,7 +552,7 @@ bool PCM::L3CacheOccupancyMetricAvailable() const bool PCM::CoreLocalMemoryBWMetricAvailable() const { - if 
(cpu_model == SKX && cpu_stepping < 5) return false; // SKZ4 errata + //if (cpu_model == SKX && cpu_stepping < 5) return false; // SKZ4 errata PCM_CPUID_INFO cpuinfo; if (!(QOSMetricAvailable() && L3QOSMetricAvailable())) return false; @@ -562,7 +562,7 @@ bool PCM::CoreLocalMemoryBWMetricAvailable() const bool PCM::CoreRemoteMemoryBWMetricAvailable() const { - if (cpu_model == SKX && cpu_stepping < 5) return false; // SKZ4 errata + //if (cpu_model == SKX && cpu_stepping < 5) return false; // SKZ4 errata PCM_CPUID_INFO cpuinfo; if (!(QOSMetricAvailable() && L3QOSMetricAvailable())) return false; diff --git a/src/pcm-sensor-server.cpp b/src/pcm-sensor-server.cpp index 72a89ec5..70e05dc6 100644 --- a/src/pcm-sensor-server.cpp +++ b/src/pcm-sensor-server.cpp @@ -427,6 +427,10 @@ class JSONPrinter : Visitor PCM* pcm = PCM::getInstance(); printCounter( "DRAM Writes", getBytesWrittenToMC ( before, after ) ); printCounter( "DRAM Reads", getBytesReadFromMC ( before, after ) ); + + if (pcm->localMemoryRequestRatioMetricAvailable()) + printCounter( "DRAM Local Percentage", getLocalMemoryRequestRatio( before, after ) ); + if(pcm->nearMemoryMetricsAvailable()){ printCounter( "NM HitRate", getNMHitRate ( before, after ) ); printCounter( "NM Hits", getNMHits ( before, after ) ); @@ -715,6 +719,10 @@ class PrometheusPrinter : Visitor addToHierarchy( "source=\"uncore\"" ); printCounter( "DRAM Writes", getBytesWrittenToMC ( before, after ) ); printCounter( "DRAM Reads", getBytesReadFromMC ( before, after ) ); + + if (pcm->localMemoryRequestRatioMetricAvailable()) + printCounter( "DRAM Local Percentage", getLocalMemoryRequestRatio( before, after ) ); + if(pcm->nearMemoryMetricsAvailable()){ printCounter( "NM Hits", getNMHits ( before, after ) ); printCounter( "NM Misses", getNMMisses ( before, after ) );