diff --git a/docs/README.md b/docs/README.md index bb325d55..6adcab7f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -17,6 +17,7 @@ | [Namespaces per cluster](namespaces-per-cluster.md) | Maximum Namespaces | None | | [Services per namespace](services-per-namespace.md) | Maximum services per namespace | None | | [FIO I/O test](fio.md) | FIO I/O test - stress storage backend | Privileged Containers, Working storage class | +| [Taints and Tolerations](taints-tolerations.md) | Taints and Tolerations Test | Privileged Containers | * Baseline job without a tooled cluster just idles a cluster. The goal is to capture resource consumption over a period of time to characterize resource requirements thus tooling is required. (For now) @@ -53,3 +54,4 @@ Each workload will implement a form of pass/fail criteria in order to flag if th | [Namespaces per cluster](namespaces-per-cluster.md) | Yes: Exit code, Test Duration | | [Services per namespace](services-per-namespace.md) | Yes: Exit code, Test Duration | | [FIO I/O test](fio.md) | No | +| [Taints and Tolerations](taints-tolerations.md) | Yes: Exit code | diff --git a/docs/taints-tolerations.md b/docs/taints-tolerations.md new file mode 100644 index 00000000..323078f5 --- /dev/null +++ b/docs/taints-tolerations.md @@ -0,0 +1,92 @@ +# Taints and Tolerations Workload + +The Taints and Tolerations workload playbook is `workloads/taints-tolerations.yml` and will run the Taints and Tolerations workload on your cluster. + +The Taints and Tolerations workload's purpose is to validate whether the OpenShift cluster can deploy 130 hello-pods with memory and CPU requests per tainted worker node. + +An OCP cluster with 3 master nodes and 3 worker nodes is required. 
+ +Running from CLI: + +```sh +$ cp workloads/inventory.example inventory +$ # Add orchestration host to inventory +$ # Edit vars in workloads/vars/taints-tolerations.yml or define Environment vars (See below) +$ time ansible-playbook -vv -i inventory workloads/taints-tolerations.yml +``` + +## Environment variables + +### PUBLIC_KEY +Default: `~/.ssh/id_rsa.pub` +Public ssh key file for Ansible. + +### PRIVATE_KEY +Default: `~/.ssh/id_rsa` +Private ssh key file for Ansible. + +### ORCHESTRATION_USER +Default: `root` +User for Ansible to log in as. Must authenticate with PUBLIC_KEY/PRIVATE_KEY. + +### WORKLOAD_IMAGE +Default: `quay.io/openshift-scale/scale-ci-workload` +Container image that runs the workload script. + +### WORKLOAD_JOB_NODE_SELECTOR +Default: `false` +Enables/disables the node selector that places the workload job on the `workload` node. + +### WORKLOAD_JOB_TAINT +Default: `false` +Enables/disables the toleration on the workload job to permit the `workload` taint. + +### WORKLOAD_JOB_PRIVILEGED +Default: `true` +Enables/disables running the workload pod as privileged. + +### KUBECONFIG_FILE +Default: `~/.kube/config` +Location of kubeconfig on orchestration host. + +### PBENCH_INSTRUMENTATION +Default: `false` +Enables/disables running the workload wrapped by pbench-user-benchmark. When enabled, pbench agents can then be enabled (`ENABLE_PBENCH_AGENTS`) for further instrumentation data and pbench-copy-results can be enabled (`ENABLE_PBENCH_COPY`) to export captured data for further analysis. + +### ENABLE_PBENCH_AGENTS +Default: `false` +Enables/disables the collection of pbench data on the pbench agent Pods. These Pods are deployed by the tooling playbook. + +### ENABLE_PBENCH_COPY +Default: `false` +Enables/disables the copying of pbench data to a remote results server for further analysis. + +### PBENCH_SSH_PRIVATE_KEY_FILE +Default: `~/.ssh/id_rsa` +Location of ssh private key to authenticate to the pbench results server. 
+ +### PBENCH_SSH_PUBLIC_KEY_FILE +Default: `~/.ssh/id_rsa.pub` +Location of the ssh public key to authenticate to the pbench results server. + +### PBENCH_SERVER +Default: There is no public default. +DNS address of the pbench results server. + +### SCALE_CI_RESULTS_TOKEN +Default: There is no public default. +Future use for pbench and prometheus scraper to place results into git repo that holds results data. + +### JOB_COMPLETION_POLL_ATTEMPTS +Default: `3600` +Number of retries for Ansible to poll if the workload job has completed. Poll attempts delay 10s between polls with some additional time taken for each polling action depending on the orchestration host setup. + +### TAINTS_TOLERATIONS_TEST_PREFIX +Default: `taints-tolerations` +Prefix for the pbench results. + +## Smoke test variables + +``` +TAINTS_TOLERATIONS_TEST_PREFIX=taints-tolerations_smoke +``` diff --git a/workloads/files/workload-taints-tolerations-script-cm.yml b/workloads/files/workload-taints-tolerations-script-cm.yml new file mode 100644 index 00000000..e4a8a541 --- /dev/null +++ b/workloads/files/workload-taints-tolerations-script-cm.yml @@ -0,0 +1,97 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: scale-ci-workload-script +data: + run.sh: | + #!/bin/sh + set -eo pipefail + workload_log() { echo "$(date -u) $@" >&2; } + export -f workload_log + workload_log "Configuring pbench for Taints and Tolerations" + mkdir -p /var/lib/pbench-agent/tools-default/ + echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd + if [ "${ENABLE_PBENCH_AGENTS}" = true ]; then + echo "" > /var/lib/pbench-agent/tools-default/disk + echo "" > /var/lib/pbench-agent/tools-default/iostat + echo "workload" > /var/lib/pbench-agent/tools-default/label + echo "" > /var/lib/pbench-agent/tools-default/mpstat + echo "" > /var/lib/pbench-agent/tools-default/oc + echo "" > /var/lib/pbench-agent/tools-default/perf + echo "" > 
/var/lib/pbench-agent/tools-default/pidstat + echo "" > /var/lib/pbench-agent/tools-default/sar + master_nodes=`oc get nodes -l pbench_agent=true,node-role.kubernetes.io/master= --no-headers | awk '{print $1}'` + for node in $master_nodes; do + echo "master" > /var/lib/pbench-agent/tools-default/remote@$node + done + infra_nodes=`oc get nodes -l pbench_agent=true,node-role.kubernetes.io/infra= --no-headers | awk '{print $1}'` + for node in $infra_nodes; do + echo "infra" > /var/lib/pbench-agent/tools-default/remote@$node + done + worker_nodes=`oc get nodes -l pbench_agent=true,node-role.kubernetes.io/worker= --no-headers | awk '{print $1}'` + for node in $worker_nodes; do + echo "worker" > /var/lib/pbench-agent/tools-default/remote@$node + done + fi + source /opt/pbench-agent/profile + workload_log "Done configuring pbench for Taints and Tolerations" + + workload_log "Running Taints and Tolerations workload" + if [ "${PBENCH_INSTRUMENTATION}" = "true" ]; then + pbench-user-benchmark -- sh /root/workload/workload.sh + result_dir="/var/lib/pbench-agent/$(ls -t /var/lib/pbench-agent/ | grep "pbench-user" | head -1)"/1/sample1 + if [ "${ENABLE_PBENCH_COPY}" = "true" ]; then + pbench-copy-results --prefix ${TAINTS_TOLERATIONS_TEST_PREFIX} + fi + else + sh /root/workload/workload.sh + result_dir=/tmp + fi + + workload_log "Completed Taints and Tolerations workload run" + + workload_log "Checking Test Results" + workload_log "Checking script taint_nodes.sh execution exit code : $(jq '.exit_code' "${result_dir}/exit.json")" + + if [ "$(jq '.exit_code==0' "${result_dir}/exit.json")" != "true" ]; then # fail closed: a missing or unparsable exit.json must not count as a pass + workload_log "Taints and Tolerations Test Failure" + workload_log "Test Analysis: Failed" + exit 1 + fi + # TODO: Check pbench-agent collected metrics for Pass/Fail + # TODO: Check prometheus collected metrics for Pass/Fail + workload_log "Test Analysis: Passed" + + workload.sh: | + #!/bin/sh + set -o pipefail + + result_dir=/tmp + if [ "${PBENCH_INSTRUMENTATION}" = "true" ]; then + 
result_dir=${benchmark_results_dir} + fi + + # git clone svt repo in /root + cd /root + git clone https://github.com/openshift/svt.git + cd svt + git status + cd /root/svt/openshift_scalability/ci/scripts + ls -ltr + + start_time=$(date +%s) + my_time=$(date +%Y-%m-%d-%H%M) + + # run taint_nodes.sh; with pipefail the pipeline status below is the script's, not tee's + ./taint_nodes.sh 2>&1 | tee /tmp/output_taint_nodes-${my_time}.log + + exit_code=$? + end_time=$(date +%s) + duration=$((end_time-start_time)) + workload_log "Test duration was: ${duration}" + + workload_log "Output of script taint_nodes.sh execution: $(cat /tmp/output_taint_nodes-${my_time}.log)" + + workload_log "Writing script taint_nodes.sh execution exit code : ${exit_code}" + jq -n '. | ."exit_code"='${exit_code}' | ."duration"='${duration}'' > "${result_dir}/exit.json" + workload_log "Finished workload script" diff --git a/workloads/taints-tolerations.yml b/workloads/taints-tolerations.yml new file mode 100644 index 00000000..d8a1a3b0 --- /dev/null +++ b/workloads/taints-tolerations.yml @@ -0,0 +1,131 @@ +--- +# +# Runs Taints and Tolerations on OpenShift 4.x cluster +# + +- name: Runs taints-tolerations on a RHCOS OpenShift cluster + hosts: orchestration + gather_facts: true + remote_user: "{{orchestration_user}}" + vars_files: + - vars/taints-tolerations.yml + vars: + workload_job: "taints-tolerations" + tasks: + - name: Create scale-ci-tooling directory + file: + path: "{{ansible_user_dir}}/scale-ci-tooling" + state: directory + + - name: Copy workload files + copy: + src: "{{item.src}}" + dest: "{{item.dest}}" + with_items: + - src: scale-ci-tooling-ns.yml + dest: "{{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml" + - src: workload-taints-tolerations-script-cm.yml + dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-taints-tolerations-script-cm.yml" + + - name: Slurp kubeconfig file + slurp: + src: "{{kubeconfig_file}}" + register: kubeconfig_file_slurp + + - name: Slurp ssh private key file + slurp: + src: 
"{{pbench_ssh_private_key_file}}" + register: pbench_ssh_private_key_file_slurp + + - name: Slurp ssh public key file + slurp: + src: "{{pbench_ssh_public_key_file}}" + register: pbench_ssh_public_key_file_slurp + + - name: Template workload templates + template: + src: "{{item.src}}" + dest: "{{item.dest}}" + with_items: + - src: pbench-cm.yml.j2 + dest: "{{ansible_user_dir}}/scale-ci-tooling/pbench-cm.yml" + - src: pbench-ssh-secret.yml.j2 + dest: "{{ansible_user_dir}}/scale-ci-tooling/pbench-ssh-secret.yml" + - src: kubeconfig-secret.yml.j2 + dest: "{{ansible_user_dir}}/scale-ci-tooling/kubeconfig-secret.yml" + - src: workload-job.yml.j2 + dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-job.yml" + - src: workload-env.yml.j2 + dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-taints-tolerations-env.yml" + + - name: Check if scale-ci-tooling namespace exists + shell: | + oc get project scale-ci-tooling + ignore_errors: true + changed_when: false + register: scale_ci_tooling_ns_exists + + - name: Ensure any stale scale-ci-taints-tolerations job is deleted + shell: | + oc delete job scale-ci-taints-tolerations -n scale-ci-tooling + register: scale_ci_tooling_project + failed_when: scale_ci_tooling_project.rc == 0 # rc 0 means a job was just deleted; treat that attempt as not done yet + until: scale_ci_tooling_project.rc == 1 # stop once oc delete returns 1 (presumably job not found - TODO confirm) + retries: 60 + delay: 1 + when: scale_ci_tooling_ns_exists.rc == 0 + + - name: Block for non-existing tooling namespace + block: + - name: Create tooling namespace + shell: | + oc create -f {{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml + + - name: Create tooling service account + shell: | + oc create serviceaccount useroot -n scale-ci-tooling + oc adm policy add-scc-to-user privileged -z useroot -n scale-ci-tooling + when: enable_pbench_agents|bool + when: scale_ci_tooling_ns_exists.rc != 0 # block runs only if the earlier oc get project check failed + + - name: Create/replace kubeconfig secret + shell: | + oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/kubeconfig-secret.yml" + + - name: Create/replace the pbench 
configmap + shell: | + oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/pbench-cm.yml" + + - name: Create/replace pbench ssh secret + shell: | + oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/pbench-ssh-secret.yml" + + - name: Create/replace workload script configmap + shell: | + oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/workload-taints-tolerations-script-cm.yml" + + - name: Create/replace workload script environment configmap + shell: | + oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/workload-taints-tolerations-env.yml" + + - name: Create/replace workload job to that runs workload script + shell: | + oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/workload-job.yml" + + - name: Poll until job pod is running + shell: | + oc get pods --selector=job-name=scale-ci-taints-tolerations -n scale-ci-tooling -o json + register: pod_json + retries: 60 + delay: 2 + until: pod_json.stdout | from_json | json_query('items[0].status.phase==`Running`') + + - name: Poll until job is complete + shell: | + oc get job scale-ci-taints-tolerations -n scale-ci-tooling -o json + register: job_json + retries: "{{job_completion_poll_attempts}}" + delay: 10 # seconds between polls of the job status + until: job_json.stdout | from_json | json_query('status.succeeded==`1` || status.failed==`1`') + failed_when: job_json.stdout | from_json | json_query('status.succeeded==`1`') == false # fail unless status.succeeded is 1 + when: job_completion_poll_attempts|int > 0 diff --git a/workloads/templates/workload-env.yml.j2 b/workloads/templates/workload-env.yml.j2 index 40b25d18..c20d656c 100644 --- a/workloads/templates/workload-env.yml.j2 +++ b/workloads/templates/workload-env.yml.j2 @@ -103,4 +103,8 @@ data: PROMETHEUS_GRAPH_PERIOD: "{{prometheus_graph_period}}" PROMETHEUS_REFRESH_INTERVAL: "{{prometheus_refresh_interval}}" PROMETHEUS_SCALE_TEST_PREFIX: "{{prometheus_scale_test_prefix}}" +{% elif 
workload_job == "taints-tolerations" %} + PBENCH_INSTRUMENTATION: "{{pbench_instrumentation|bool|lower}}" + ENABLE_PBENCH_COPY: "{{enable_pbench_copy|bool|lower}}" + TAINTS_TOLERATIONS_TEST_PREFIX: "{{taints_tolerations_test_prefix}}" {% endif %} diff --git a/workloads/vars/taints-tolerations.yml b/workloads/vars/taints-tolerations.yml new file mode 100644 index 00000000..c4dd26b2 --- /dev/null +++ b/workloads/vars/taints-tolerations.yml @@ -0,0 +1,33 @@ +--- +############################################################################### +# Ansible SSH variables. +############################################################################### +ansible_public_key_file: "{{ lookup('env', 'PUBLIC_KEY')|default('~/.ssh/id_rsa.pub', true) }}" +ansible_private_key_file: "{{ lookup('env', 'PRIVATE_KEY')|default('~/.ssh/id_rsa', true) }}" + +orchestration_user: "{{ lookup('env', 'ORCHESTRATION_USER')|default('root', true) }}" +############################################################################### +# Taints and Tolerations workload variables. 
+############################################################################### +workload_image: "{{ lookup('env', 'WORKLOAD_IMAGE')|default('quay.io/openshift-scale/scale-ci-workload', true) }}" + +workload_job_node_selector: "{{ lookup('env', 'WORKLOAD_JOB_NODE_SELECTOR')|default(false, true)|bool }}" +workload_job_taint: "{{ lookup('env', 'WORKLOAD_JOB_TAINT')|default(false, true)|bool }}" +workload_job_privileged: "{{ lookup('env', 'WORKLOAD_JOB_PRIVILEGED')|default(true, true)|bool }}" + +kubeconfig_file: "{{ lookup('env', 'KUBECONFIG_FILE')|default('~/.kube/config', true) }}" + +# pbench variables +pbench_instrumentation: "{{ lookup('env', 'PBENCH_INSTRUMENTATION')|default(false, true)|bool|lower }}" +enable_pbench_agents: "{{ lookup('env', 'ENABLE_PBENCH_AGENTS')|default(false, true)|bool }}" +enable_pbench_copy: "{{ lookup('env', 'ENABLE_PBENCH_COPY')|default(false, true)|bool|lower }}" +pbench_ssh_private_key_file: "{{ lookup('env', 'PBENCH_SSH_PRIVATE_KEY_FILE')|default('~/.ssh/id_rsa', true) }}" +pbench_ssh_public_key_file: "{{ lookup('env', 'PBENCH_SSH_PUBLIC_KEY_FILE')|default('~/.ssh/id_rsa.pub', true) }}" +pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}" + +# Other variables for workload tests +scale_ci_results_token: "{{ lookup('env', 'SCALE_CI_RESULTS_TOKEN')|default('', true) }}" +job_completion_poll_attempts: "{{ lookup('env', 'JOB_COMPLETION_POLL_ATTEMPTS')|default(3600, true)|int }}" # NOTE(review): docs/taints-tolerations.md must state the same default - confirm 3600 is intended + +# taints-tolerations workload specific parameters: +taints_tolerations_test_prefix: "{{ lookup('env', 'TAINTS_TOLERATIONS_TEST_PREFIX')|default('taints-tolerations', true) }}"