diff --git a/docs/mastervertical.md b/docs/mastervertical.md
index 2c3c855c..24c3d783 100644
--- a/docs/mastervertical.md
+++ b/docs/mastervertical.md
@@ -127,6 +127,24 @@ Basename used by cluster loader for the project(s) it creates.
 Default: `1000`
 Maximum number of projects that will be created by the mastervertical workload. Typically much higher values are used than the default for large scale tests.
 
+### PPROF_COLLECT
+Default: `false`
+If you'd like to enable pprof profile data collection of the kube-apiserver and Prometheus through conprof (https://github.com/conprof/conprof), set this to `true`.
+Enabling this will create a few services to collect profiles from the apiserver pods and then add a conprof tarball to the pbench tarball.
+
+### PROM_AGGREGATE_COLLECT
+Default: `false`
+If you'd like to enable collection of Prometheus data aggregations for the kube-apiserver through touchstone (https://github.com/cloud-bulldozer/touchstone), set this to `true`.
+Enabling this will create a few services to produce Prometheus data aggregations from the apiserver pods.
+
+### TOUCHSTONE_ES_HOST
+Default: ``
+Elasticsearch server host; set it to index results from touchstone (Prometheus metric data aggregations).
+
+### TOUCHSTONE_ES_PORT
+Default: ``
+Elasticsearch server port; set it to index results from touchstone (Prometheus metric data aggregations).
+
 ### EXPECTED_MASTERVERTICAL_DURATION
 Default: `600`
 Pass/fail criteria. Value to determine if MasterVertical workload executed in duration expected.
diff --git a/docs/nodevertical.md b/docs/nodevertical.md
index ea2ae65b..d0d13418 100644
--- a/docs/nodevertical.md
+++ b/docs/nodevertical.md
@@ -175,6 +175,19 @@ Default: `false`
 If you'd like to enable pprof profile data collection of kubeapiserver and prometheus through conprof(https://github.com/conprof/conprof).
 Enabling this will create a few services to collect profiles from the apiserver pods and then create a conprof tarball in the pbench tarball
 
+### PROM_AGGREGATE_COLLECT
+Default: `false`
+If you'd like to enable collection of Prometheus data aggregations for the kube-apiserver through touchstone (https://github.com/cloud-bulldozer/touchstone), set this to `true`.
+Enabling this will create a few services to produce Prometheus data aggregations from the apiserver pods.
+
+### TOUCHSTONE_ES_HOST
+Default: ``
+Elasticsearch server host; set it to index results from touchstone (Prometheus metric data aggregations).
+
+### TOUCHSTONE_ES_PORT
+Default: ``
+Elasticsearch server port; set it to index results from touchstone (Prometheus metric data aggregations).
+
 ### NODEVERTICAL_HEAVY_PROBE_PERIOD
 Default: `30`
 Readiness probe period for the application deployed by the heavy nodevertical.
diff --git a/docs/prometheus-scale.md b/docs/prometheus-scale.md
index 958dd9cf..427a46d0 100644
--- a/docs/prometheus-scale.md
+++ b/docs/prometheus-scale.md
@@ -98,3 +98,9 @@ Sleep interval for each block iteration in seconds.
 ### PROMETHEUS_SCALE_TEST_PREFIX
 Default: `prometheus-scale`
 Sets the pbench result test prefix.
+
+### PPROF_COLLECT
+Default: `false`
+If you'd like to enable pprof profile data collection of the kube-apiserver and Prometheus through conprof (https://github.com/conprof/conprof), set this to `true`.
+Enabling this will create a few services to collect profiles from the apiserver pods and then create a conprof tarball in the pbench tarball + diff --git a/workloads/mastervertical.yml b/workloads/mastervertical.yml index d124031f..4a92f7f4 100644 --- a/workloads/mastervertical.yml +++ b/workloads/mastervertical.yml @@ -40,18 +40,19 @@ src: "{{pbench_ssh_public_key_file}}" register: pbench_ssh_public_key_file_slurp - - name: Block to set clustername - block: - - name: Get cluster name - shell: | - {%raw%}oc get machineset -n openshift-machine-api -o=go-template='{{index (index .items 0).metadata.labels "machine.openshift.io/cluster-api-cluster"}}'{%endraw%} - register: cluster_name + - name: Set cluster details + include_role: + name: cluster_details - - name: Create tooling service account - set_fact: - snafu_cluster_name: cluster_name.stdout - when: cluster_name is succeeded - when: snafu_cluster_name == "" + - name: Collect pprof + include_role: + name: pprof-collection + when: pprof_collect and pprof_collect != "" + + - name: Get Prometheus authorizations + include_role: + name: prometheus_metric_aggregation + when: prom_aggregate_collect and prom_aggregate_collect != "" - name: Template workload templates template: diff --git a/workloads/nodevertical.yml b/workloads/nodevertical.yml index e2184c35..093b96bb 100644 --- a/workloads/nodevertical.yml +++ b/workloads/nodevertical.yml @@ -85,6 +85,11 @@ name: pprof-collection when: pprof_collect and pprof_collect != "" + - name: Get Prometheus authorizations + include_role: + name: prometheus_metric_aggregation + when: prom_aggregate_collect and prom_aggregate_collect != "" + - name: Set NodeVertical template set_fact: nodevertical_template: "{% if nodevertical_heavy|bool %}workload-nodevertical-heavy-script-cm.yml.j2{% else %}workload-nodevertical-script-cm.yml.j2{% endif %}" diff --git a/workloads/prometheus.yml b/workloads/prometheus.yml index f5a8f17f..f79eaffa 100644 --- a/workloads/prometheus.yml +++ b/workloads/prometheus.yml @@ -24,8 +24,6 @@ with_items: - src: scale-ci-tooling-ns.yml dest: "{{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml" - - src: workload-prometheus-script-cm.yml - dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-script-cm.yml" - name: Slurp kubeconfig file slurp: @@ -42,6 +40,15 @@ src: "{{pbench_ssh_public_key_file}}" register: pbench_ssh_public_key_file_slurp + - name: Set cluster details + include_role: + name: cluster_details + + - name: Collect pprof + include_role: + name: pprof-collection + when: pprof_collect and pprof_collect != "" + - name: Template workload templates template: src: "{{item.src}}" @@ -58,6 +65,8 @@ dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-job.yml" - src: workload-env.yml.j2 dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-env.yml" + - src: workload-prometheus-script-cm.yml.j2 + dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-script-cm.yml" - name: Check if scale-ci-tooling namespace exists shell: | diff --git a/workloads/roles/prometheus_metric_aggregation/tasks/main.yml b/workloads/roles/prometheus_metric_aggregation/tasks/main.yml new file mode 100644 index 00000000..26c9831f --- /dev/null +++ b/workloads/roles/prometheus_metric_aggregation/tasks/main.yml @@ -0,0 +1,16 @@ +- name: check sa for prom-server + shell: "oc get sa -n openshift-kube-apiserver | grep prom-server | wc -l" + register: prom_server_sa + +- name: create sa to access prom_server profiles + block: + - name: create sa + shell: "oc -n 
openshift-kube-apiserver create sa prom-server" + + - name: add cluster-admin clusterrrole + shell: "oc create clusterrolebinding prom-admin --clusterrole cluster-admin --serviceaccount=openshift-kube-apiserver:prom-server" + when: prom_server_sa.stdout | int == 0 + +- name: get the bearer token + shell: "oc -n openshift-kube-apiserver sa get-token prom-server" + register: prom_bearer_token diff --git a/workloads/templates/workload-deployments-per-ns-script-cm.yml.j2 b/workloads/templates/workload-deployments-per-ns-script-cm.yml.j2 index dca9a86c..622a8173 100644 --- a/workloads/templates/workload-deployments-per-ns-script-cm.yml.j2 +++ b/workloads/templates/workload-deployments-per-ns-script-cm.yml.j2 @@ -51,7 +51,7 @@ data: export es={{ snafu_es_host }} export es_port={{ snafu_es_port }} export es_index={{ snafu_es_index_prefix }} - pbench-user-benchmark -- 'VIPERCONFIG=/root/workload/cluster-limits-deployments-per-namespace.yaml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir /tmp/snafu_results -p openshift-tests' + pbench-user-benchmark -- 'VIPERCONFIG=/root/workload/cluster-limits-deployments-per-namespace.yaml run_snafu -t cl scale-ci --cl-output True --dir /tmp/snafu_results -p openshift-tests' pbench-copy-results --prefix {{deployments_per_ns_test_prefix}} echo "$(date -u) Completed running Deployments per ns cluster limits test" # End of Test Code diff --git a/workloads/templates/workload-env.yml.j2 b/workloads/templates/workload-env.yml.j2 index 994f16bc..f055720e 100644 --- a/workloads/templates/workload-env.yml.j2 +++ b/workloads/templates/workload-env.yml.j2 @@ -5,6 +5,7 @@ metadata: data: ENABLE_PBENCH_AGENTS: "{{enable_pbench_agents|bool|lower}}" PPROF_COLLECT: "{{ ((pprof_collect == None) | ternary(false, pprof_collect)) if pprof_collect is defined else false}}" + PROM_AGGREGATE_COLLECT: "{{ ((prom_aggregate_collect == None) | ternary(false, prom_aggregate_collect)) if prom_aggregate_collect is defined else false}}" {% if workload_job == "http" %} {% for v in http_env_vars %} {{ v }}: "{{ lookup('env', v) }}" @@ -18,6 +19,8 @@ data: MASTERVERTICAL_PROJECTS: "{{mastervertical_projects}}" EXPECTED_MASTERVERTICAL_DURATION: "{{expected_mastervertical_duration}}" AZURE_AUTH: "{{azure_auth|bool|lower}}" + TOUCHSTONE_ES_HOST: "{{ touchstone_es_host }}" + TOUCHSTONE_ES_PORT: "{{ touchstone_es_port }}" {% elif workload_job == "network" %} NETWORK_TEST_UPERF_IMAGE: "{{network_test_uperf_image}}" NETWORK_TEST_UPERF_SSHD_PORT: "{{network_test_uperf_sshd_port}}" @@ -54,6 +57,8 @@ data: AZURE_AUTH: "{{azure_auth|bool|lower}}" NODEVERTICAL_HEAVY_PROBE_ENDPOINT: "{{ nodevertical_heavy_probe_endpoint }}" NODEVERTICAL_HEAVY_PROBE_PERIOD: "{{ nodevertical_heavy_probe_period }}" + TOUCHSTONE_ES_HOST: "{{ touchstone_es_host }}" + TOUCHSTONE_ES_PORT: "{{ touchstone_es_port }}" {% elif workload_job == "podvertical" %} PBENCH_INSTRUMENTATION: "{{pbench_instrumentation|bool|lower}}" ENABLE_PBENCH_COPY: "{{enable_pbench_copy|bool|lower}}" diff --git a/workloads/templates/workload-fio-script-cm.yml.j2 b/workloads/templates/workload-fio-script-cm.yml.j2 index d44c1afa..b61b816c 100644 --- a/workloads/templates/workload-fio-script-cm.yml.j2 +++ b/workloads/templates/workload-fio-script-cm.yml.j2 @@ -56,7 +56,7 @@ data: export es={{ snafu_es_host }} export es_port={{ snafu_es_port }} export es_index={{ snafu_es_index_prefix }} - pbench-user-benchmark --config="{{ fiotest_prefix }}-pods-{{ fiotest_maxpods }}-sc-{{ fiotest_storageclass }}-create_pods-{{ fiotest_description }}" -- 
'VIPERCONFIG=/root/workload/fiotest.yml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir /tmp/snafu_results -p openshift-tests' + pbench-user-benchmark --config="{{ fiotest_prefix }}-pods-{{ fiotest_maxpods }}-sc-{{ fiotest_storageclass }}-create_pods-{{ fiotest_description }}" -- 'VIPERCONFIG=/root/workload/fiotest.yml run_snafu -t cl scale-ci --cl-output True --dir /tmp/snafu_results -p openshift-tests' echo "$(date -u) Pods for FIO I/O test created." # wait until all pods are started and then collect data diff --git a/workloads/templates/workload-mastervertical-script-cm.yml.j2 b/workloads/templates/workload-mastervertical-script-cm.yml.j2 index 87a6a678..6283e3e6 100644 --- a/workloads/templates/workload-mastervertical-script-cm.yml.j2 +++ b/workloads/templates/workload-mastervertical-script-cm.yml.j2 @@ -40,6 +40,14 @@ data: fi workload_log "Done configuring pbench for MasterVertical" + if [ "${PPROF_COLLECT}" = "true" ]; then + workload_log "Configuring conprof" + envsubst < /root/workload/conprof.yaml.template > /tmp/conprof.yaml + envsubst < /root/workload/conprof_start.sh > /tmp/conprof_start.sh + envsubst < /root/workload/conprof_stop.sh > /tmp/conprof_stop.sh + workload_log "Done configuring conprof" + fi + workload_log "Configuring MasterVertical test" envsubst < /root/workload/mastervertical.yaml.template > /tmp/mastervertical.yaml workload_log "Done configuring MasterVertical test" @@ -75,28 +83,63 @@ data: # TODO: Check pbench-agent collected metrics for Pass/Fail # TODO: Check prometheus collected metrics for Pass/Fail workload_log "Test Analysis: Passed" + + conprof_start.sh: | + #!/bin/sh + set -o pipefail + nohup /usr/bin/conprof all --config.file /tmp/conprof.yaml --log.level=debug --storage.tsdb.path=/tmp/data &>/tmp/conprof.log & + conprof_stop.sh: | + #!/bin/sh + set -o pipefail + pkill conprof + prom_aggregation_start.sh: | + #!/bin/sh + set -o pipefail + touchstone_compare -database prometheus -v -prom_config /tmp/prom_config.yaml workload.sh: | #!/bin/sh set -o pipefail result_dir=/tmp + if [ "${PPROF_COLLECT}" = "true" ]; then + workload_log "Starting conprof" + bash /tmp/conprof_start.sh + fi + if [ "${PBENCH_INSTRUMENTATION}" = "true" ]; then result_dir=${benchmark_results_dir} fi if [[ "${AZURE_AUTH}" == "true" ]]; then export AZURE_AUTH_LOCATION=/tmp/azure_auth fi - start_time=$(date +%s) + export start_time=$(date +%s) export cluster_name={{ snafu_cluster_name }} export test_user={{ snafu_user }} export es={{ snafu_es_host }} export es_port={{ snafu_es_port }} export es_index={{ snafu_es_index_prefix }} - VIPERCONFIG=/tmp/mastervertical.yaml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt" + VIPERCONFIG=/tmp/mastervertical.yaml run_snafu -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt" exit_code=$? 
- end_time=$(date +%s) + export end_time=$(date +%s) duration=$((end_time-start_time)) + if [ "${PROM_AGGREGATE_COLLECT}" = "true" ]; then + workload_log "Configuring touchstone" + envsubst < /root/workload/prom_config.yaml.template > /tmp/prom_config.yaml + envsubst < /root/workload/prom_aggregation_start.sh > /tmp/prom_aggregation_start.sh + workload_log "Starting touchstone" + bash /tmp/prom_aggregation_start.sh + workload_log "Completed prometheus data aggregations and stored logs" + fi + if [ "${PPROF_COLLECT}" = "true" ]; then + workload_log "Stopping conprof" + bash /tmp/conprof_stop.sh + cp /tmp/conprof.log ${result_dir}/conprof.log + cp /tmp/conprof.yaml ${result_dir}/conprof.yaml + tar -czvf ${result_dir}/conprof.tar.gz /tmp/data/ + workload_log "copied conprof tarballs and log" + fi + workload_log "Writing Cluster Loader Exit Code" jq -n '. | ."exit_code"='${exit_code}' | ."duration"='${duration}'' > "${result_dir}/exit.json" workload_log "Writing Cluster Loader Metrics to clusterloader.json" @@ -560,3 +603,125 @@ data: required: true labels: template: routeTemplate + prom_config.yaml.template: | + --- + - url: https://prometheus-k8s-openshift-monitoring.apps.{{clustername}}.{{base_domain}} + query_list: + - sum(container_memory_rss{namespace=~"openshift-kube-apiserver",name!="",container=~"kube-apiserver.*"}) by (container) + bearer_token: {{prom_bearer_token.stdout}} + disable_ssl: True + start_time_list: + - $start_time + end_time_list: + - $end_time + index_result_to_es: True + conprof.yaml.template: | + scrape_configs: + - job_name: 'apiserver0' + scrape_interval: 30s + scrape_timeout: 10m + scheme: https + tls_config: + insecure_skip_verify: true + static_configs: + - targets: ['apiserver0-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}'] + bearer_token: {{bearer_token.stdout}} + profiling_config: + pprof_config: + heap: + enabled: true + profile: + enabled: true + goroutine: + enabled: false + threadcreate: + enabled: false + allocs: + enabled: false + block: + enabled: false + mutex: + enabled: false + trace: + enabled: false + - job_name: 'apiserver1' + scrape_interval: 30s + scrape_timeout: 10m + scheme: https + tls_config: + insecure_skip_verify: true + static_configs: + - targets: ['apiserver1-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}'] + bearer_token: {{bearer_token.stdout}} + profiling_config: + pprof_config: + heap: + enabled: true + profile: + enabled: true + goroutine: + enabled: false + threadcreate: + enabled: false + allocs: + enabled: false + block: + enabled: false + mutex: + enabled: false + trace: + enabled: false + - job_name: 'apiserver2' + scrape_interval: 30s + scrape_timeout: 10m + scheme: https + tls_config: + insecure_skip_verify: true + static_configs: + - targets: ['apiserver2-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}'] + bearer_token: {{bearer_token.stdout}} + profiling_config: + pprof_config: + heap: + enabled: true + profile: + enabled: true + goroutine: + enabled: false + threadcreate: + enabled: false + allocs: + enabled: false + block: + enabled: false + mutex: + enabled: false + trace: + enabled: false + - job_name: 'prometheus' + scrape_interval: 30s + scrape_timeout: 10m + scheme: https + tls_config: + insecure_skip_verify: true + static_configs: + - targets: ['prometheus-k8s-openshift-monitoring.apps.{{clustername}}.{{base_domain}}'] + bearer_token: {{bearer_token.stdout}} + profiling_config: + pprof_config: + heap: + enabled: true + profile: + enabled: false + goroutine: + enabled: 
false
+          threadcreate:
+            enabled: false
+          allocs:
+            enabled: false
+          block:
+            enabled: false
+          mutex:
+            enabled: false
+          trace:
+            enabled: false
diff --git a/workloads/templates/workload-namespaces-per-cluster-script-cm.yml.j2 b/workloads/templates/workload-namespaces-per-cluster-script-cm.yml.j2
index 537f7248..57647299 100644
--- a/workloads/templates/workload-namespaces-per-cluster-script-cm.yml.j2
+++ b/workloads/templates/workload-namespaces-per-cluster-script-cm.yml.j2
@@ -92,7 +92,7 @@
     export es={{ snafu_es_host }}
     export es_port={{ snafu_es_port }}
     export es_index={{ snafu_es_index_prefix }}
-    VIPERCONFIG=/tmp/namespaces_per_cluster.yaml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt"
+    VIPERCONFIG=/tmp/namespaces_per_cluster.yaml run_snafu -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt"
     exit_code=$?
     end_time=$(date +%s)
     duration=$((end_time-start_time))
diff --git a/workloads/templates/workload-network-script-cm.yml.j2 b/workloads/templates/workload-network-script-cm.yml.j2
index d7edf7aa..8bd62337 100644
--- a/workloads/templates/workload-network-script-cm.yml.j2
+++ b/workloads/templates/workload-network-script-cm.yml.j2
@@ -77,7 +77,7 @@
     export es={{ snafu_es_host }}
     export es_port={{ snafu_es_port }}
     export es_index={{ snafu_es_index_prefix }}
-    VIPERCONFIG=/tmp/network.yaml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests
+    VIPERCONFIG=/tmp/network.yaml run_snafu -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests
 
     # Check if clients and servers are ready
     for rc_num in `seq 0 $((${pair_count} - 1))`
diff --git a/workloads/templates/workload-nodevertical-heavy-script-cm.yml.j2 b/workloads/templates/workload-nodevertical-heavy-script-cm.yml.j2
index a91ccce0..ec1817b9 100644
--- a/workloads/templates/workload-nodevertical-heavy-script-cm.yml.j2
+++ b/workloads/templates/workload-nodevertical-heavy-script-cm.yml.j2
@@ -90,7 +90,7 @@
     export es={{ snafu_es_host }}
     export es_port={{ snafu_es_port }}
     export es_index={{ snafu_es_index_prefix }}
-    VIPERCONFIG=/tmp/nodevertical-heavy.yaml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt"
+    VIPERCONFIG=/tmp/nodevertical-heavy.yaml run_snafu -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt"
     exit_code=$?
end_time=$(date +%s) duration=$((end_time-start_time)) diff --git a/workloads/templates/workload-nodevertical-script-cm.yml.j2 b/workloads/templates/workload-nodevertical-script-cm.yml.j2 index 2039e0a2..f22b31be 100644 --- a/workloads/templates/workload-nodevertical-script-cm.yml.j2 +++ b/workloads/templates/workload-nodevertical-script-cm.yml.j2 @@ -90,10 +90,13 @@ data: #!/bin/sh set -o pipefail pkill conprof + prom_aggregation_start.sh: | + #!/bin/sh + set -o pipefail + touchstone_compare -database prometheus -v -prom_config /tmp/prom_config.yaml workload.sh: | #!/bin/sh set -o pipefail - result_dir=/tmp if [ "${PPROF_COLLECT}" = "true" ]; then workload_log "Starting conprof" @@ -102,7 +105,7 @@ data: if [ "${PBENCH_INSTRUMENTATION}" = "true" ]; then result_dir=${benchmark_results_dir} fi - start_time=$(date +%s) + export start_time=$(date +%s) if [[ "${AZURE_AUTH}" == "true" ]]; then export AZURE_AUTH_LOCATION=/tmp/azure_auth fi @@ -111,11 +114,18 @@ data: export es={{ snafu_es_host }} export es_port={{ snafu_es_port }} export es_index={{ snafu_es_index_prefix }} - VIPERCONFIG=/tmp/nodevertical.yaml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt" + VIPERCONFIG=/tmp/nodevertical.yaml run_snafu -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt" exit_code=$? - end_time=$(date +%s) + export end_time=$(date +%s) duration=$((end_time-start_time)) - + if [ "${PROM_AGGREGATE_COLLECT}" = "true" ]; then + workload_log "Configuring touchstone" + envsubst < /root/workload/prom_config.yaml.template > /tmp/prom_config.yaml + envsubst < /root/workload/prom_aggregation_start.sh > /tmp/prom_aggregation_start.sh + workload_log "Starting touchstone" + bash /tmp/prom_aggregation_start.sh + workload_log "Completed prometheus data aggregations and stored logs" + fi if [ "${PPROF_COLLECT}" = "true" ]; then workload_log "Stopping conprof" bash /tmp/conprof_stop.sh @@ -173,6 +183,18 @@ data: privileged: false nodeSelector: nodevertical: 'true' + prom_config.yaml.template: | + --- + - url: https://prometheus-k8s-openshift-monitoring.apps.{{clustername}}.{{base_domain}} + query_list: + - sum(container_memory_rss{namespace=~"openshift-kube-apiserver",name!="",container=~"kube-apiserver.*"}) by (container) + bearer_token: {{prom_bearer_token.stdout}} + disable_ssl: True + start_time_list: + - $start_time + end_time_list: + - $end_time + index_result_to_es: True conprof.yaml.template: | scrape_configs: - job_name: 'apiserver0' @@ -198,8 +220,6 @@ data: enabled: false block: enabled: false - cmdline: - enabled: false mutex: enabled: false trace: @@ -227,8 +247,6 @@ data: enabled: false block: enabled: false - cmdline: - enabled: false mutex: enabled: false trace: @@ -256,8 +274,6 @@ data: enabled: false block: enabled: false - cmdline: - enabled: false mutex: enabled: false trace: @@ -285,8 +301,6 @@ data: enabled: false block: enabled: false - cmdline: - enabled: false mutex: enabled: false trace: diff --git a/workloads/templates/workload-podvertical-script-cm.yml.j2 b/workloads/templates/workload-podvertical-script-cm.yml.j2 index 0f7da5a0..05fef457 100644 --- a/workloads/templates/workload-podvertical-script-cm.yml.j2 +++ b/workloads/templates/workload-podvertical-script-cm.yml.j2 @@ -93,7 +93,7 @@ data: export es={{ snafu_es_host }} export es_port={{ snafu_es_port }} export es_index={{ snafu_es_index_prefix }} - VIPERCONFIG=/tmp/podvertical.yaml python3 
/tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt" + VIPERCONFIG=/tmp/podvertical.yaml run_snafu -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt" exit_code=$? end_time=$(date +%s) duration=$((end_time-start_time)) diff --git a/workloads/files/workload-prometheus-script-cm.yml b/workloads/templates/workload-prometheus-script-cm.yml.j2 similarity index 81% rename from workloads/files/workload-prometheus-script-cm.yml rename to workloads/templates/workload-prometheus-script-cm.yml.j2 index 03ce3e1d..4b6b301b 100644 --- a/workloads/files/workload-prometheus-script-cm.yml +++ b/workloads/templates/workload-prometheus-script-cm.yml.j2 @@ -38,6 +38,14 @@ data: fi workload_log "Done configuring pbench for Prometheus scale" + if [ "${PPROF_COLLECT}" = "true" ]; then + workload_log "Configuring conprof" + envsubst < /root/workload/conprof.yaml.template > /tmp/conprof.yaml + envsubst < /root/workload/conprof_start.sh > /tmp/conprof_start.sh + envsubst < /root/workload/conprof_stop.sh > /tmp/conprof_stop.sh + workload_log "Done configuring conprof" + fi + workload_log "Running Prometheus scale workload" if [ "${PBENCH_INSTRUMENTATION}" = "true" ]; then pbench-user-benchmark --pbench-post='sh /root/workload/post-run.sh' -- sh /root/workload/workload.sh @@ -53,10 +61,25 @@ data: RESULT_DIR=/tmp fi workload_log "Completed Prometheus scale workload run" + + conprof_start.sh: | + #!/bin/sh + set -o pipefail + nohup /usr/bin/conprof all --config.file /tmp/conprof.yaml --log.level=debug --storage.tsdb.path=/tmp/data &>/tmp/conprof.log & + conprof_stop.sh: | + #!/bin/sh + set -o pipefail + pkill conprof + workload.sh: | #!/bin/sh set -ox pipefail + if [ "${PPROF_COLLECT}" = "true" ]; then + workload_log "Starting conprof" + bash /tmp/conprof_start.sh + fi + db_aging() { while true; do echo "$(date +'%m-%d-%y-%H:%M:%S') $(oc exec prometheus-k8s-0 -n openshift-monitoring -c prometheus -- df |grep /prometheus$)" >> /tmp/pvc_monitor_0.log @@ -80,13 +103,22 @@ data: # stop the prometheus load kill -9 ${loader_pid} ${db_aging_pid} + if [ "${PPROF_COLLECT}" = "true" ]; then + workload_log "Stopping conprof" + bash /tmp/conprof_stop.sh + cp /tmp/conprof.log ${benchmark_results_dir}/conprof.log + cp /tmp/conprof.yaml ${benchmark_results_dir}/conprof.yaml + tar -czvf ${benchmark_results_dir}/conprof.tar.gz /tmp/data/ + workload_log "copied conprof tarballs and log" + fi + # test idle sleep 300 post-run.sh: | #!/bin/sh set -ox pipefail - RESULT_DIR="/var/lib/pbench-agent/$(ls -t /var/lib/pbench-agent/ | grep "pbench-user" | head -1)"/1/sample1 + RESULT_DIR="/var/lib/pbench-agent/$(ls -t /var/lib/pbench-agent/ | grep "pbench-user" | head -1)"/1-default/sample1 echo "Using RESULT_DIR of: \"${RESULT_DIR}\"" oc logs -n openshift-monitoring prometheus-k8s-0 -c prometheus --since=${PROMETHEUS_DURATION}s > ${RESULT_DIR}/oc_logs_1.log oc logs -n openshift-monitoring prometheus-k8s-1 -c prometheus --since=${PROMETHEUS_DURATION}s > ${RESULT_DIR}/oc_logs_2.log @@ -463,3 +495,113 @@ data: def get_dashboards(self): return self.dashboards + conprof.yaml.template: | + scrape_configs: + - job_name: 'apiserver0' + scrape_interval: 30s + scrape_timeout: 10m + scheme: https + tls_config: + insecure_skip_verify: true + static_configs: + - targets: ['apiserver0-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}'] + bearer_token: {{bearer_token.stdout}} + profiling_config: + 
pprof_config: + heap: + enabled: true + profile: + enabled: true + goroutine: + enabled: false + threadcreate: + enabled: false + allocs: + enabled: false + block: + enabled: false + mutex: + enabled: false + trace: + enabled: false + - job_name: 'apiserver1' + scrape_interval: 30s + scrape_timeout: 10m + scheme: https + tls_config: + insecure_skip_verify: true + static_configs: + - targets: ['apiserver1-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}'] + bearer_token: {{bearer_token.stdout}} + profiling_config: + pprof_config: + heap: + enabled: true + profile: + enabled: true + goroutine: + enabled: false + threadcreate: + enabled: false + allocs: + enabled: false + block: + enabled: false + mutex: + enabled: false + trace: + enabled: false + - job_name: 'apiserver2' + scrape_interval: 30s + scrape_timeout: 10m + scheme: https + tls_config: + insecure_skip_verify: true + static_configs: + - targets: ['apiserver2-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}'] + bearer_token: {{bearer_token.stdout}} + profiling_config: + pprof_config: + heap: + enabled: true + profile: + enabled: true + goroutine: + enabled: false + threadcreate: + enabled: false + allocs: + enabled: false + block: + enabled: false + mutex: + enabled: false + trace: + enabled: false + - job_name: 'prometheus' + scrape_interval: 30s + scrape_timeout: 10m + scheme: https + tls_config: + insecure_skip_verify: true + static_configs: + - targets: ['prometheus-k8s-openshift-monitoring.apps.{{clustername}}.{{base_domain}}'] + bearer_token: {{bearer_token.stdout}} + profiling_config: + pprof_config: + heap: + enabled: true + profile: + enabled: false + goroutine: + enabled: false + threadcreate: + enabled: false + allocs: + enabled: false + block: + enabled: false + mutex: + enabled: false + trace: + enabled: false diff --git a/workloads/templates/workload-pvcscale-script-cm.yml.j2 b/workloads/templates/workload-pvcscale-script-cm.yml.j2 index 7da1c797..f2ca1920 100644 --- a/workloads/templates/workload-pvcscale-script-cm.yml.j2 +++ b/workloads/templates/workload-pvcscale-script-cm.yml.j2 @@ -51,7 +51,7 @@ data: export es={{ snafu_es_host }} export es_port={{ snafu_es_port }} export es_index={{ snafu_es_index_prefix }} - pbench-user-benchmark --config="{{ pvcscale_test_prefix }}-pods-{{ pvcscale_maxpods }}-sc-{{ pvcscale_storageclass }}-create_pods" -- 'VIPERCONFIG=/root/workload/pvcscale.yml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir /tmp/snafu_results -p openshift-tests' + pbench-user-benchmark --config="{{ pvcscale_test_prefix }}-pods-{{ pvcscale_maxpods }}-sc-{{ pvcscale_storageclass }}-create_pods" -- 'VIPERCONFIG=/root/workload/pvcscale.yml run_snafu -t cl scale-ci --cl-output True --dir /tmp/snafu_results -p openshift-tests' echo "$(date -u) Pods/PVC are crated ..." 
     # End Test Configuration
diff --git a/workloads/templates/workload-services-per-namespace-script-cm.yml.j2 b/workloads/templates/workload-services-per-namespace-script-cm.yml.j2
index 5b5df720..4446a3da 100644
--- a/workloads/templates/workload-services-per-namespace-script-cm.yml.j2
+++ b/workloads/templates/workload-services-per-namespace-script-cm.yml.j2
@@ -92,7 +92,7 @@
     export es={{ snafu_es_host }}
    export es_port={{ snafu_es_port }}
     export es_index={{ snafu_es_index_prefix }}
-    VIPERCONFIG=/tmp/services_per_namespace.yaml python3 /tmp/snafu/run_snafu.py -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt"
+    VIPERCONFIG=/tmp/services_per_namespace.yaml run_snafu -t cl scale-ci --cl-output True --dir "${result_dir}" -p openshift-tests | tee "${result_dir}/clusterloader.txt"
     exit_code=$?
     end_time=$(date +%s)
     duration=$((end_time-start_time))
diff --git a/workloads/vars/mastervertical.yml b/workloads/vars/mastervertical.yml
index 197d9953..a269465a 100644
--- a/workloads/vars/mastervertical.yml
+++ b/workloads/vars/mastervertical.yml
@@ -32,6 +32,14 @@ pbench_ssh_private_key_file: "{{ lookup('env', 'PBENCH_SSH_PRIVATE_KEY_FILE')|de
 pbench_ssh_public_key_file: "{{ lookup('env', 'PBENCH_SSH_PUBLIC_KEY_FILE')|default('~/.ssh/id_rsa.pub', true) }}"
 pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
 
+# pprof variables
+pprof_collect: "{{ lookup('env', 'PPROF_COLLECT')|default(false, true)|bool|lower }}"
+
+# touchstone variables
+prom_aggregate_collect: "{{ lookup('env', 'PROM_AGGREGATE_COLLECT')|default(false, true)|bool|lower }}"
+touchstone_es_host: "{{ lookup('env', 'TOUCHSTONE_ES_HOST')|default('', true) }}"
+touchstone_es_port: "{{ lookup('env', 'TOUCHSTONE_ES_PORT')|default('', true) }}"
+
 # Azure auth vars to set for ocp on azure
 azure_auth: "{{ lookup('env', 'AZURE_AUTH')|default(false, true)|bool|lower }}"
 azure_auth_file: "{{ lookup('env', 'AZURE_AUTH_FILE')|default('', true) }}"
diff --git a/workloads/vars/nodevertical.yml b/workloads/vars/nodevertical.yml
index d826d7c2..48ec7f7c 100644
--- a/workloads/vars/nodevertical.yml
+++ b/workloads/vars/nodevertical.yml
@@ -35,6 +35,11 @@ pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
 # pporf variables
 pprof_collect: "{{ lookup('env', 'PPROF_COLLECT')|default(false, true)|bool|lower }}"
 
+# touchstone variables
+prom_aggregate_collect: "{{ lookup('env', 'PROM_AGGREGATE_COLLECT')|default(false, true)|bool|lower }}"
+touchstone_es_host: "{{ lookup('env', 'TOUCHSTONE_ES_HOST')|default('', true) }}"
+touchstone_es_port: "{{ lookup('env', 'TOUCHSTONE_ES_PORT')|default('', true) }}"
+
 # Azure auth vars to set for ocp on azure
 azure_auth: "{{ lookup('env', 'AZURE_AUTH')|default(false, true)|bool|lower }}"
 azure_auth_file: "{{ lookup('env', 'AZURE_AUTH_FILE')|default('', true) }}"
diff --git a/workloads/vars/prometheus.yml b/workloads/vars/prometheus.yml
index 59b81951..0aa0b596 100644
--- a/workloads/vars/prometheus.yml
+++ b/workloads/vars/prometheus.yml
@@ -29,6 +29,9 @@ pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
 scale_ci_results_token: "{{ lookup('env', 'SCALE_CI_RESULTS_TOKEN')|default('', true) }}"
 job_completion_poll_attempts: "{{ lookup('env', 'JOB_COMPLETION_POLL_ATTEMPTS')|default(360, true)|int }}"
 
+# pprof variables
+pprof_collect: "{{ lookup('env', 'PPROF_COLLECT')|default(false, true)|bool|lower }}"
+
 # Prometheus scale workload specific parameters:
 prometheus_scale_test_prefix: "{{ lookup('env', 'PROMETHEUS_SCALE_TEST_PREFIX')|default('prometheus-scale', true) }}"
 prometheus_concurrency: "{{ lookup('env', 'PROMETHEUS_CONCURRENCY')|default(10, true)|int }}"
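As a usage sketch for the environment variables introduced above: the `TOUCHSTONE_ES_HOST`/`TOUCHSTONE_ES_PORT` values are only consumed when `PROM_AGGREGATE_COLLECT` is enabled, and the host, port, inventory file, and `ansible-playbook` invocation below are placeholders following the repository's usual run pattern, not values defined by this change.

```sh
# Hypothetical example: run a NodeVertical test with conprof profiling and
# touchstone Prometheus aggregation enabled. Host/port are placeholders.
export PPROF_COLLECT=true
export PROM_AGGREGATE_COLLECT=true
export TOUCHSTONE_ES_HOST=es.example.com
export TOUCHSTONE_ES_PORT=9200

# Assumed invocation style; adjust the inventory and playbook path to your setup.
ansible-playbook -vv -i hosts workloads/nodevertical.yml
```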