diff --git a/.env b/.env index 4c56cf6aba8..718ed47e391 100644 --- a/.env +++ b/.env @@ -1,6 +1,6 @@ # Globals -VERSION="1.4.4" -CEPH_VERSION="19.2.1" +VERSION="1.4.23" +CEPH_VERSION="19.2.3" SPDK_VERSION="24.09" CONTAINER_REGISTRY="quay.io/ceph" QUAY_SPDK="${CONTAINER_REGISTRY}/spdk" @@ -61,17 +61,17 @@ SPDK_CONFIGURE_ARGS="--with-rbd --disable-tests --disable-unit-tests --disable-e SPDK_TARGET_ARCH="x86-64-v2" SPDK_MAKEFLAGS= SPDK_CENTOS_BASE="https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/Packages/" -SPDK_CENTOS_REPO_VER="9.0-21.el9" +SPDK_CENTOS_REPO_VER="9.0-28.el9" # Ceph Cluster CEPH_CLUSTER_VERSION="${CEPH_VERSION}" -CEPH_BRANCH=main -CEPH_SHA=latest +CEPH_BRANCH=wip-leonidc1911-gw-del-delay-blklst +CEPH_SHA=7cb09ba8ee2c559ea82878449296ac5e8f356be4 CEPH_DEVEL_MGR_PATH=../ceph # Atom -ATOM_SHA=1542fc2867c570c7af142a379b512394623dcbe4 +ATOM_SHA=1fea0085a1853d0b75bd9d34c28a4caebc0c1e68 # Demo settings RBD_POOL=rbd diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index 51534b8732c..c7dd48e0734 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -9,9 +9,7 @@ on: # yamllint disable rule:truthy - '*' pull_request: branches: - - devel - schedule: - - cron: '0 0 * * *' + - '*' workflow_dispatch: release: types: @@ -100,54 +98,12 @@ jobs: path: | ceph.tar - build-arm64: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Build container images - spdk - run: make build SVC=spdk TARGET_ARCH=arm64 - - - name: Build container images - bdevperf - run: make build SVC=bdevperf TARGET_ARCH=arm64 - - - name: Build container images - nvmeof - run: make build SVC=nvmeof TARGET_ARCH=arm64 - - - name: Build container images - nvmeof-cli - run: make build SVC=nvmeof-cli TARGET_ARCH=arm64 - - - name: Build container images - ceph - run: make build SVC=ceph TARGET_ARCH=arm64 - - - name: Save container images - run: | - . 
.env - docker tag $QUAY_NVMEOF:$NVMEOF_VERSION $QUAY_NVMEOF:$NVMEOF_VERSION-arm64 - docker tag $QUAY_NVMEOFCLI:$NVMEOF_VERSION $QUAY_NVMEOFCLI:$NVMEOF_VERSION-arm64 - docker save $QUAY_NVMEOF:$NVMEOF_VERSION-arm64 > nvmeof-arm64.tar - docker save $QUAY_NVMEOFCLI:$NVMEOF_VERSION-arm64 > nvmeof-cli-arm64.tar - - - name: Upload nvmeof-arm64 container images - uses: actions/upload-artifact@v4 - with: - name: container_images_nvmeof_arm64 - path: | - nvmeof-arm64.tar - nvmeof-cli-arm64.tar - pytest: needs: [build, build-ceph] strategy: fail-fast: false matrix: - test: ["cli", "cli_change_lb", "cli_change_keys", "cli_change_ns_visibility", "cli_trash_rbd", "state", "multi_gateway", "server", "grpc", "omap_lock", "log_files", "nsid", "psk", "dhchap", "subsys_grp_name_append", "max_subsystems"] + test: ["cli", "cli_change_lb", "cli_change_keys", "cli_change_ns_visibility", "cli_trash_rbd", "state", "multi_gateway", "server", "grpc", "omap_lock", "log_files", "nsid", "psk", "dhchap", "subsys_grp_name_append", "max_subsystems", "omap_no_read_lock", "omap_read_lock", "omap_read_lock_ignore_errors", "big_omap", "ns_limit"] runs-on: ubuntu-latest env: HUGEPAGES: 512 # for multi gateway test, approx 256 per gateway instance @@ -235,20 +191,6 @@ jobs: # Managing pytest’s output: https://docs.pytest.org/en/7.1.x/how-to/output.html make run SVC="nvmeof" OPTS="--volume=$(pwd)/tests:/src/tests --entrypoint=python3" CMD="-m pytest --show-capture=all -s --full-trace -vv -rA tests/test_${{ matrix.test }}.py" - - name: Check coredump existence - if: success() || failure() - id: check_coredumps - uses: andstor/file-existence-action@20b4d2e596410855db8f9ca21e96fbe18e12930b # v2, pinned to SHA for security reasons - with: - files: "/tmp/coredump/core.*" - - - name: Upload ${{ matrix.test }} test core dumps - if: steps.check_coredumps.outputs.files_exists == 'true' - uses: actions/upload-artifact@v4 - with: - name: core_pytest_${{ matrix.test }} - path: /tmp/coredump/core.* - - name: Copy ceph logs if: success() || failure() run: docker cp ceph:/ceph/out /tmp/out @@ -347,20 +289,6 @@ jobs: run: | ./tests/ha/demo_test.sh bdevperf_${{ matrix.security_protocol }} - - name: Check coredump existence - if: success() || failure() - id: check_coredumps - uses: andstor/file-existence-action@20b4d2e596410855db8f9ca21e96fbe18e12930b # v2, pinned to SHA for security reasons - with: - files: "/tmp/coredump/core.*" - - - name: Upload demo-${{ matrix.security_protocol }} core dumps - if: steps.check_coredumps.outputs.files_exists == 'true' - uses: actions/upload-artifact@v4 - with: - name: core_demo_${{ matrix.security_protocol }} - path: /tmp/coredump/core.* - # For debugging purposes (provides an SSH connection to the runner) # - name: Setup tmate session # uses: mxschmitt/action-tmate@v3 @@ -546,20 +474,6 @@ jobs: bdevperf="/usr/libexec/spdk/scripts/bdevperf.py" make exec SVC=bdevperf OPTS=-T CMD="$bdevperf -v -t $timeout -s $BDEVPERF_SOCKET perform_tests" - - name: Check coredump existence - if: success() || failure() - id: check_coredumps - uses: andstor/file-existence-action@20b4d2e596410855db8f9ca21e96fbe18e12930b # v2, pinned to SHA for security reasons - with: - files: "/tmp/coredump/core.*" - - - name: Upload demo core dumps - if: steps.check_coredumps.outputs.files_exists == 'true' - uses: actions/upload-artifact@v4 - with: - name: core_demo_discovery - path: /tmp/coredump/core.* - - name: Display logs if: success() || failure() run: make logs OPTS='' @@ -646,20 +560,6 @@ jobs: . 
.env source "tests/ha/${{ matrix.test }}.sh" - - name: Check coredump existence - if: success() || failure() - id: check_coredumps - uses: andstor/file-existence-action@20b4d2e596410855db8f9ca21e96fbe18e12930b # v2, pinned to SHA for security reasons - with: - files: "/tmp/coredump/core.*" - - - name: Upload ha core dumps - if: steps.check_coredumps.outputs.files_exists == 'true' - uses: actions/upload-artifact@v4 - with: - name: core_demo_ha-${{ matrix.test }} - path: /tmp/coredump/core.* - - name: Copy ceph logs if: success() || failure() run: docker cp ceph:/ceph/out /tmp/out @@ -689,11 +589,16 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Download container images - uses: actions/download-artifact@v4 - with: - pattern: container_images_nvmeof - merge-multiple: true + - name: Container_images_nvmeof folder (tar files) download + run: | + for i in {1..5}; do + echo "Attempt $i to download artifacts..." + gh run download --pattern "container_images_nvmeof" && break + echo "Download failed. Retrying in $((i * 5))s..." + sleep $((i * 5)) + done + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Cluster build and Atom tests run if: always() @@ -706,190 +611,56 @@ jobs: if: always() run: ./tests/atom/cpArtifactAndCleanup.sh - - name: Upload artifact for non-schedule events - uses: actions/upload-artifact@v4 - if: always() - with: - name: atom-artifact - path: /home/cephnvme/artifact_m7.tar.gz - - atom-nightly: - needs: [build, build-ceph] - if: github.repository == 'ceph/ceph-nvmeof' && github.event_name == 'schedule' - runs-on: nightlyAtomRunner - timeout-minutes: 1440 - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download container images - uses: actions/download-artifact@v4 - with: - pattern: container_images_nvmeof - merge-multiple: true - - - name: Cluster build and Atom nightly tests run - if: always() - run: | - . 
.env - ACTION_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" - stdbuf -oL -eL ./tests/atom/clusterBuildTestsRun.sh $NVMEOF_VERSION $CEPH_BRANCH $CEPH_SHA $ATOM_SHA $ACTION_URL 'nightly' 2>&1 | tee nightly_nvmeof_console.log - exit ${PIPESTATUS[0]} - - - name: Atom artifact build nightly - if: always() - run: ./tests/atom/cpArtifactAndCleanup.sh 'nightly' - - - name: Upload nightly nvmeof console as artifact - if: always() - uses: actions/upload-artifact@v4 - with: - name: nightly-nvmeof-full-console - path: nightly_nvmeof_console.log - - - name: Upload artifact for schedule events - uses: actions/upload-artifact@v4 - if: always() - with: - name: atom-artifact - path: /home/cephnvme/artifact_m8.tar.gz - - push-images-to-ceph-registry: - if: github.event_name == 'release' - needs: [pytest, demo, discovery, ha, atom] - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download container images (nvmeof) - uses: actions/download-artifact@v4 - with: - pattern: container_images_nvmeof* - merge-multiple: true - - - name: Download container image (spdk) - uses: actions/download-artifact@v4 - with: - pattern: container_image_spdk - merge-multiple: true - - - name: Load container images - run: | - docker load < nvmeof.tar - docker load < nvmeof-cli.tar - docker load < nvmeof-arm64.tar - docker load < nvmeof-cli-arm64.tar - docker load < spdk.tar - - - name: Login to quay.io - uses: docker/login-action@v2 - with: - registry: ${{ vars.CONTAINER_REGISTRY }} - username: '${{ vars.CONTAINER_REGISTRY_USERNAME }}' - password: '${{ secrets.CONTAINER_REGISTRY_PASSWORD }}' - - - name: Publish nvmeof containers and spdk when release/tag is created - run: | - make push - make push TAG_SUFFIX="-arm64" - make push-manifest-list - - push-devel-image-to-ceph-registry: - if: github.event_name == 'push' && github.ref == 'refs/heads/devel' - needs: [pytest, demo, discovery, ha, atom] - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download container images - uses: actions/download-artifact@v4 - with: - pattern: container_images_nvmeof* - merge-multiple: true - - - name: Load container images - run: | - docker load < nvmeof.tar - docker load < nvmeof-cli.tar - - - name: Login to quay.io - uses: docker/login-action@v2 - with: - registry: ${{ vars.CONTAINER_REGISTRY }} - username: '${{ vars.CONTAINER_REGISTRY_USERNAME }}' - password: '${{ secrets.CONTAINER_REGISTRY_PASSWORD }}' - - - name: Publish nvmeof containers when merged to devel - run: | - . .env - for image in nvmeof nvmeof-cli; do - docker tag $CONTAINER_REGISTRY/$image:$NVMEOF_VERSION $CONTAINER_REGISTRY/$image:devel - docker push $CONTAINER_REGISTRY/$image:devel - done - - pull-and-run-images: - needs: [push-images-to-ceph-registry] - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: setup - run: make setup - - - name: Pull Docker images from Quay.io - run: | - make pull || true # Continue even if some images fail to pull - - - name: List pulled images - run: docker images - - - name: Verify downloaded images - run: | - . .env - REQUIRED_IMAGES=( - $QUAY_SPDK:$SPDK_VERSION - $QUAY_NVMEOFCLI:$NVMEOF_CLI_VERSION - $QUAY_NVMEOF:$NVMEOF_VERSION - $QUAY_CEPH:$CEPH_VERSION - ) - - echo "Checking which required images are available:" - for image in "${REQUIRED_IMAGES[@]}"; do - if docker image inspect "$image" > /dev/null 2>&1; then - echo "✅ $image is available." 
- else - echo "❌ $image is missing." - exit 1 # Fail the job - fi - done - - name: Run Docker containers - run: make up - - name: Wait for the Gateway to be listening - timeout-minutes: 3 - run: | - . .env - - echo using gateway $NVMEOF_IP_ADDRESS port $NVMEOF_GW_PORT - until nc -z $NVMEOF_IP_ADDRESS $NVMEOF_GW_PORT; do - echo -n . - sleep ${{ env.WAIT_INTERVAL_SECS }} - done - - name: check health - run: make ps - - name: Run demo - run: make demo - - name: Tear down - if: success() || failure() - run: | - make down - make clean + # TODO: investigating upload to artifact issue + # - name: Upload artifact for non-schedule events + # uses: actions/upload-artifact@v4 + # if: always() + # with: + # name: atom-artifact + # path: /home/cephnvme/artifact_m7.tar.gz + + # atom-nightly: + # needs: [build, build-ceph] + # if: github.repository == 'ceph/ceph-nvmeof' && github.event_name == 'schedule' + # runs-on: nightlyAtomRunner + # timeout-minutes: 1440 + # steps: + # - name: Checkout code + # uses: actions/checkout@v4 + + # - name: Container_images_nvmeof folder (tar files) download + # run: | + # for i in {1..5}; do + # echo "Attempt $i to download artifacts..." + # gh run download --pattern "container_images_nvmeof" && break + # echo "Download failed. Retrying in $((i * 5))s..." + # sleep $((i * 5)) + # done + # env: + # GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + # - name: Cluster build and Atom nightly tests run + # if: always() + # run: | + # . .env + # ACTION_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + # stdbuf -oL -eL ./tests/atom/clusterBuildTestsRun.sh $NVMEOF_VERSION $CEPH_BRANCH $CEPH_SHA $ATOM_SHA $ACTION_URL 'nightly' 2>&1 | tee nightly_nvmeof_console.log + # exit ${PIPESTATUS[0]} + + # - name: Atom artifact build nightly + # if: always() + # run: ./tests/atom/cpArtifactAndCleanup.sh 'nightly' + + # - name: Upload nightly nvmeof console as artifact + # if: always() + # uses: actions/upload-artifact@v4 + # with: + # name: atom-nightly-nvmeof-full-console + # path: atom_nightly_nvmeof_console.log + + # - name: Upload artifact for schedule events + # uses: actions/upload-artifact@v4 + # if: always() + # with: + # name: atom-artifact + # path: /home/cephnvme/artifact_m8.tar.gz diff --git a/.github/workflows/check-deps.yml b/.github/workflows/check-deps.yml deleted file mode 100644 index 0c22d9f2edc..00000000000 --- a/.github/workflows/check-deps.yml +++ /dev/null @@ -1,16 +0,0 @@ -# Blocks PR merging if PR description contains "depends on" or "blocked by" -# pointing to another PR until that PR is merged - -name: Check Dependencies -on: [pull_request] - -jobs: - check-deps: - runs-on: ubuntu-latest - name: Check Dependencies - steps: - # https://github.com/marketplace/actions/pr-dependency-check - # Pinned to v1.2.4 SHA-1 for security reasons - - uses: gregsdennis/dependencies-action@71c5cc14fab62389a600c0a6e37584dc4916799c - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000000..f57582758b2 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,133 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. 
Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL Advanced" + +on: + push: + branches: [ "devel", "*", "reef_7.1", "squid_8.0", "squid_8.1", "tentacle_9.0" ] + pull_request: + branches: [ "devel", "*", "reef_7.1", "squid_8.0", "squid_8.1", "tentacle_9.0" ] + schedule: + - cron: '38 0 * * 6' + +jobs: + detect-go: + name: Detect Go Code + runs-on: ubuntu-latest + permissions: + contents: read + outputs: + has-go: ${{ steps.check.outputs.has-go }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Check for Go code + id: check + run: | + if find . -name "*.go" -not -path "./vendor/*" -not -path "./.git/*" | grep -q .; then + echo "has-go=true" >> $GITHUB_OUTPUT + echo "✅ Go code found - will analyze" + else + echo "has-go=false" >> $GITHUB_OUTPUT + echo "⏭️ No Go code - skipping Go analysis" + fi + + analyze: + name: Analyze (${{ matrix.language }}) + needs: detect-go + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ubuntu-latest + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: actions + build-mode: none + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, + # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. + # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how + # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Add any setup steps before running the `github/codeql-action/init` action. + # This includes steps like installing compilers or runtimes (`actions/setup-node` + # or others). This is typically only required for manual builds. + # - name: Setup runtime (example) + # uses: actions/setup-example@v1 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. 
+ + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" + + analyze-go: + name: Analyze (go) + needs: detect-go + if: needs.detect-go.outputs.has-go == 'true' + runs-on: ubuntu-latest + permissions: + security-events: write + packages: read + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: go + build-mode: autobuild + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:go" diff --git a/Dockerfile b/Dockerfile index 7157bddc080..29c0cdc1ada 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,6 +19,7 @@ RUN \ --mount=type=cache,target=/var/lib/dnf \ dnf install -y python3-rados && \ dnf install -y python3-rbd && \ + dnf install -y gdb && \ dnf config-manager --set-enabled crb && \ dnf install -y ceph-mon-client-nvmeof ENTRYPOINT ["python3", "-m", "control"] diff --git a/Dockerfile.spdk b/Dockerfile.spdk index 257551d1cc7..7e3af641b55 100644 --- a/Dockerfile.spdk +++ b/Dockerfile.spdk @@ -106,7 +106,7 @@ ARG SPDK_CEPH_VERSION \ ARG SPDK_CENTOS_BASE="https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/Packages/" # This would become obsolete as the release rolls out new packages -ARG SPDK_CENTOS_REPO_VER="9.0-21.el9" +ARG SPDK_CENTOS_REPO_VER="9.0-28.el9" ARG SPDK_PKGDEP_ARGS \ SPDK_CONFIGURE_ARGS \ diff --git a/ceph-nvmeof.conf b/ceph-nvmeof.conf index 7a846caf3a0..1c076b90a08 100644 --- a/ceph-nvmeof.conf +++ b/ceph-nvmeof.conf @@ -16,11 +16,15 @@ enable_auth = False state_update_notify = True state_update_timeout_in_msec = 2000 state_update_interval_sec = 5 +break_update_interval_sec = 25 enable_spdk_discovery_controller = False encryption_key = /etc/ceph/encryption.key rebalance_period_sec = 7 max_gws_in_grp = 16 max_ns_to_change_lb_grp = 8 +#abort_on_errors = True +#omap_file_ignore_unlock_errors = False +#omap_file_lock_on_read = True #omap_file_lock_duration = 20 #omap_file_lock_retries = 30 #omap_file_lock_retry_sleep_interval = 1.0 @@ -39,7 +43,7 @@ max_ns_to_change_lb_grp = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_hosts = 2048 -#max_namespaces = 1024 +#max_namespaces = 2048 #max_namespaces_per_subsystem = 256 #max_hosts_per_subsystem = 32 @@ -59,6 +63,7 @@ log_level=debug [discovery] addr = 0.0.0.0 port = 8009 +#abort_on_errors = True [ceph] pool = rbd diff --git a/control/cephutils.py b/control/cephutils.py index 0aae503c58e..f102591be97 100644 --- a/control/cephutils.py +++ b/control/cephutils.py @@ -80,10 +80,13 @@ def get_number_created_gateways(self, pool, group, caching=True): if pos != -1: data = json.loads(conv_str) self.rebalance_supported = True - self.rebalance_ana_group = data.get("rebalance_ana_group", None) + self.rebalance_ana_group = data.get("rebalance_ana_group", 0) + if self.rebalance_ana_group == 0: + self.logger.info("illegal rebalance ana group 0") + self.rebalance_supported = False self.num_gws = data.get("num gws", None) - self.logger.info(f"Rebalance ana_group: {self.rebalance_ana_group}, " - f"num-gws: {self.num_gws}") + self.logger.debug(f"Rebalance ana_group: {self.rebalance_ana_group}, " + 
f"num-gws: {self.num_gws}") else: self.rebalance_supported = False pos = conv_str.find("[") @@ -95,7 +98,7 @@ def get_number_created_gateways(self, pool, group, caching=True): self.logger.debug(f"new_str : {new_str}") for x in int_str_list: self.anagroup_list.append(int(x)) - self.logger.info(f"ANA group list: {self.anagroup_list}") + self.logger.debug(f"ANA group list: {self.anagroup_list}") else: self.logger.warning("GWs not found") diff --git a/control/cli.py b/control/cli.py index bb299dc29ba..408f5cc7fa7 100644 --- a/control/cli.py +++ b/control/cli.py @@ -2346,7 +2346,8 @@ def ns_add_host(self, args): try: add_host_req = pb2.namespace_add_host_req(subsystem_nqn=args.subsystem, nsid=args.nsid, - host_nqn=one_host_nqn) + host_nqn=one_host_nqn, + force=args.force) ret = self.stub.namespace_add_host(add_host_req) except Exception as ex: ret = pb2.req_status(status=errno.EINVAL, @@ -2654,6 +2655,10 @@ def ns_set_rbd_trash_image(self, args): ns_add_host_args_list = ns_common_args + [ argument("--nsid", help="Namespace ID", type=int, required=True), argument("--host-nqn", "-t", help="Host NQN list", nargs="+", required=True), + argument("--force", + help="Allow adding the host to the namespace even if the host " + "has no access to the subsystem", + action='store_true', required=False), ] ns_del_host_args_list = ns_common_args + [ argument("--nsid", help="Namespace ID", type=int, required=True), diff --git a/control/discovery.py b/control/discovery.py index 8862463d3e9..c3db0399f36 100644 --- a/control/discovery.py +++ b/control/discovery.py @@ -10,7 +10,7 @@ import argparse import json from .config import GatewayConfig -from .state import GatewayState, LocalGatewayState, OmapGatewayState, GatewayStateHandler +from .state import GatewayState, LocalGatewayState, OmapLock, OmapGatewayState, GatewayStateHandler from .utils import GatewayLogger from .utils import GatewayUtilsCrypto @@ -327,7 +327,11 @@ def __init__(self, config): self.version = 1 self.config = config self.lock = threading.Lock() - self.omap_state = OmapGatewayState(self.config, f"discovery-{socket.gethostname()}") + self.abort_on_error = self.config.getboolean_with_default("discovery", + "abort_on_errors", + True) + self.omap_state = OmapGatewayState(self.config, None, f"discovery-{socket.gethostname()}") + self.omap_state.abort_on_error = self.abort_on_error self.gw_logger_object = GatewayLogger(config) self.logger = self.gw_logger_object.logger @@ -344,6 +348,7 @@ def __init__(self, config): assert 0 self.logger.info(f"discovery addr: {self.discovery_addr} port: {self.discovery_port}") + self.omap_lock = None self.sock = None self.conn_vals = {} self.connection_counter = 1 @@ -354,7 +359,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): if self.omap_state: - self.omap_state.cleanup_omap() + self.omap_state.cleanup_omap(self.omap_lock) self.omap_state = None if self.selector: @@ -390,8 +395,14 @@ def __exit__(self, exc_type, exc_value, traceback): def _read_all(self) -> Dict[str, str]: """Reads OMAP and returns dict of all keys and values.""" - omap_dict = self.omap_state.get_state() - return omap_dict + try: + omap_dict = self.omap_state.get_state() + return omap_dict + except Exception: + self.logger.exception("Failure getting OMAP state for discovery") + if self.abort_on_error: + raise + return {} def _get_vals(self, omap_dict, prefix): """Read values from the OMAP dict.""" @@ -416,6 +427,9 @@ def reply_initialize(self, conn): except BrokenPipeError: self.logger.error("client disconnected 
unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply initialize connection request.") return 0 @@ -469,6 +483,9 @@ def reply_fc_cmd_connect(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply connect request.") return 0 @@ -534,6 +551,9 @@ def reply_fc_cmd_prop_get(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply property get request.") return 0 @@ -572,6 +592,9 @@ def reply_fc_cmd_prop_set(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply property set request.") return 0 @@ -636,6 +659,9 @@ def reply_identify(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply identify request.") return 0 @@ -673,6 +699,9 @@ def reply_set_feature(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply set feature request.") return 0 @@ -709,6 +738,9 @@ def reply_get_feature(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply get feature request.") return 0 @@ -857,6 +889,9 @@ def reply_get_log_page(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply get log page request.") return 0 @@ -890,6 +925,9 @@ def reply_keep_alive(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.debug("reply keep alive request.") return 0 @@ -917,6 +955,9 @@ def reply_not_supported(self, conn, data, cmd_id): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return -1 + except OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return -1 self.logger.warning("reply not supported opcode.") return 0 @@ -938,7 +979,7 @@ def store_async(self, conn, data, cmd_id): self_conn.recv_async = True self_conn.async_cmd_id = cmd_id - def _state_notify_update(self, update, is_add_req): + def _state_notify_update(self, update, is_add_req, break_interval): """Notify and reply async event.""" should_send_async_event = False @@ -971,6 +1012,9 @@ def _state_notify_update(self, update, is_add_req): except BrokenPipeError: self.logger.error("client disconnected unexpectedly.") return + except 
OSError as ex: + self.logger.exception(f"got OS error {ex.errno}: {ex.strerror}") + return self.logger.debug("notify and reply async request.") self.conn_vals[key].recv_async = False return @@ -1029,8 +1073,8 @@ def nvmeof_tcp_connection(self, conn, mask): self_conn.recv_buffer += message else: return - except BlockingIOError: - self.logger.error("recived data failed.") + except OSError as ex: + self.logger.error(f"Receiving data failed. {ex.errno}: {ex.strerror}") while True: if len(self_conn.recv_buffer) < 8: @@ -1158,6 +1202,7 @@ def start_service(self): self.omap_state, self._state_notify_update, dummy_crypto, f"discovery-{socket.gethostname()}") + self.omap_lock = OmapLock(gateway_state, None) gateway_state.start_update() try: diff --git a/control/grpc.py b/control/grpc.py index 215c5bdf83a..b7ee66e868d 100644 --- a/control/grpc.py +++ b/control/grpc.py @@ -56,6 +56,31 @@ MONITOR_POLLING_RATE_SEC = 2 # monitor polls gw each 2 seconds +class SubsystemsCache: + def __init__(self): + self.cache_lock = threading.Lock() + with self.cache_lock: + self.subsystems_info = pb2.subsystems_info(subsystems=[]) + + def get_subsystems(self) -> pb2.subsystems_info: + with self.cache_lock: + return self.subsystems_info + + def get_one_subsystem(self, subsys: str) -> list[pb2.subsystem]: + if not subsys: + return [] + + with self.cache_lock: + for s in self.subsystems_info.subsystems: + if s.nqn == subsys: + return [s] + return [] + + def set_subsystems(self, subsystems: pb2.subsystems_info): + with self.cache_lock: + self.subsystems_info = subsystems + + class BdevStatus: def __init__(self, status, error_message, bdev_name="", rbd_pool=None, rbd_image_name=None, trash_image=False): @@ -421,105 +446,115 @@ class NamespacesLocalList: def __init__(self): self.namespace_list = defaultdict(dict) + self.namespace_list_lock = threading.Lock() def remove_namespace(self, nqn, nsid=None): - if nqn in self.namespace_list: - if nsid: - if nsid in self.namespace_list[nqn]: - self.namespace_list[nqn].pop(nsid, None) - if len(self.namespace_list[nqn]) == 0: - self.namespace_list.pop(nqn, None) # last ns of subsystem was removed - else: - self.namespace_list.pop(nqn, None) + with self.namespace_list_lock: + if nqn in self.namespace_list: + if nsid: + if nsid in self.namespace_list[nqn]: + self.namespace_list[nqn].pop(nsid, None) + if len(self.namespace_list[nqn]) == 0: + self.namespace_list.pop(nqn, None) # last ns of subsystem was removed + else: + self.namespace_list.pop(nqn, None) def add_namespace(self, nqn, nsid, bdev, uuid, anagrpid, auto_visible, pool, image, trash_image): if not bdev: bdev = GatewayService.find_unique_bdev_name(uuid) - self.namespace_list[nqn][nsid] = NamespaceInfo(nsid, bdev, uuid, anagrpid, - auto_visible, pool, image, trash_image) + with self.namespace_list_lock: + self.namespace_list[nqn][nsid] = NamespaceInfo(nsid, bdev, uuid, anagrpid, + auto_visible, pool, image, trash_image) def find_namespace(self, nqn, nsid, uuid=None) -> NamespaceInfo: - if nqn not in self.namespace_list: - return NamespacesLocalList.EMPTY_NAMESPACE + with self.namespace_list_lock: + if nqn not in self.namespace_list: + return NamespacesLocalList.EMPTY_NAMESPACE - # if we have nsid, use it as the key - if nsid: - if nsid in self.namespace_list[nqn]: - return self.namespace_list[nqn][nsid] - return NamespacesLocalList.EMPTY_NAMESPACE + # if we have nsid, use it as the key + if nsid: + if nsid in self.namespace_list[nqn]: + return self.namespace_list[nqn][nsid] + return NamespacesLocalList.EMPTY_NAMESPACE - if uuid: - 
for ns in self.namespace_list[nqn]: - if NamespaceInfo.are_uuids_equal(uuid, self.namespace_list[nqn][ns].uuid): - return self.namespace_list[nqn][ns] + if uuid: + for ns in self.namespace_list[nqn]: + if NamespaceInfo.are_uuids_equal(uuid, self.namespace_list[nqn][ns].uuid): + return self.namespace_list[nqn][ns] return NamespacesLocalList.EMPTY_NAMESPACE def get_namespace_count(self, nqn, auto_visible=None, min_hosts=0) -> int: - if nqn and nqn not in self.namespace_list: - return 0 + with self.namespace_list_lock: + if nqn and nqn not in self.namespace_list: + return 0 - if nqn: - subsystems = [nqn] - else: - subsystems = self.namespace_list.keys() + if nqn: + subsystems = [nqn] + else: + subsystems = self.namespace_list.keys() - ns_count = 0 - for one_subsys in subsystems: - for nsid in self.namespace_list[one_subsys]: - ns = self.namespace_list[one_subsys][nsid] - if ns.empty(): - continue - if auto_visible is not None: - if ns.auto_visible == auto_visible and ns.host_count() >= min_hosts: - ns_count += 1 - else: - if ns.host_count() >= min_hosts: - ns_count += 1 + ns_count = 0 + for one_subsys in subsystems: + for nsid in self.namespace_list[one_subsys]: + ns = self.namespace_list[one_subsys][nsid] + if ns.empty(): + continue + if auto_visible is not None: + if ns.auto_visible == auto_visible and ns.host_count() >= min_hosts: + ns_count += 1 + else: + if ns.host_count() >= min_hosts: + ns_count += 1 return ns_count def get_namespace_infos_for_anagrpid(self, nqn: str, anagrpid: int) -> Iterator[NamespaceInfo]: """Yield NamespaceInfo instances for a given nqn and anagrpid.""" - if nqn in self.namespace_list: - for ns_info in self.namespace_list[nqn].values(): - if ns_info.anagrpid == anagrpid: - yield ns_info + + with self.namespace_list_lock: + if nqn in self.namespace_list: + for ns_info in self.namespace_list[nqn].values(): + if ns_info.anagrpid == anagrpid: + yield ns_info def get_all_namespaces_by_ana_group_id(self, anagrpid): ns_list = [] # Loop through all nqn values in the namespace list - for nqn in self.namespace_list: - for nsid in self.namespace_list[nqn]: - ns = self.namespace_list[nqn][nsid] - if ns.empty(): - continue - if ns.anagrpid == anagrpid: - ns_list.append((nsid, nqn)) # list of tupples + with self.namespace_list_lock: + for nqn in self.namespace_list: + for nsid in self.namespace_list[nqn]: + ns = self.namespace_list[nqn][nsid] + if ns.empty(): + continue + if ns.anagrpid == anagrpid: + ns_list.append((nsid, nqn)) # list of tuples return ns_list def get_ana_group_id_by_nsid_subsys(self, nqn, nsid): - if nqn not in self.namespace_list: - return 0 - if nsid not in self.namespace_list[nqn]: - return 0 - ns = self.namespace_list[nqn][nsid] - if ns.empty(): - return 0 - return ns.anagrpid + with self.namespace_list_lock: + if nqn not in self.namespace_list: + return 0 + if nsid not in self.namespace_list[nqn]: + return 0 + ns = self.namespace_list[nqn][nsid] + if ns.empty(): + return 0 + return ns.anagrpid def get_subsys_namespaces_by_ana_group_id(self, nqn, anagrpid): ns_list = [] - if nqn not in self.namespace_list: - return ns_list + with self.namespace_list_lock: + if nqn not in self.namespace_list: + return ns_list - for nsid in self.namespace_list[nqn]: - ns = self.namespace_list[nqn][nsid] - if ns.empty(): - continue - if ns.anagrpid == anagrpid: - ns_list.append(ns) + for nsid in self.namespace_list[nqn]: + ns = self.namespace_list[nqn][nsid] + if ns.empty(): + continue + if ns.anagrpid == anagrpid: + ns_list.append(ns) return ns_list @@ -547,14 +582,17 @@ 
class GatewayService(pb2_grpc.GatewayServicer): DHCHAP_CONTROLLER_PREFIX = "dhchap_ctrlr" KEYS_DIR = "/var/tmp" MAX_SUBSYSTEMS_DEFAULT = 128 - MAX_NAMESPACES_DEFAULT = 1024 + MAX_NAMESPACES_DEFAULT = 2048 MAX_NAMESPACES_PER_SUBSYSTEM_DEFAULT = 256 + # The actual highest value seems to be 3647, so pick a lower value + MAX_VALUE_FOR_MAX_NAMESPACES_PER_SUBSYSTEM = 2048 MAX_HOSTS_PER_SUBSYS_DEFAULT = 128 MAX_HOSTS_DEFAULT = 2048 def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rpc_lock, omap_lock: OmapLock, group_id: int, spdk_rpc_client, - spdk_rpc_subsystems_client, ceph_utils: CephUtils) -> None: + spdk_rpc_subsystems_client, ceph_utils: CephUtils, + set_gateway_exit_message) -> None: """Constructor""" self.gw_logger_object = GatewayLogger(config) self.logger = self.gw_logger_object.logger @@ -608,6 +646,12 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, "gateway", "max_namespaces_per_subsystem", GatewayService.MAX_NAMESPACES_PER_SUBSYSTEM_DEFAULT) + biggest_max_ns_per_subsys = GatewayService.MAX_VALUE_FOR_MAX_NAMESPACES_PER_SUBSYSTEM + if self.max_namespaces_per_subsystem > biggest_max_ns_per_subsys: + self.logger.error(f"Max namespaces per subsystem can't be greater than " + f"{biggest_max_ns_per_subsys}, will use " + f"this value instead") + self.max_namespaces_per_subsystem = biggest_max_ns_per_subsys self.max_hosts_per_subsystem = self.config.getint_with_default( "gateway", "max_hosts_per_subsystem", @@ -621,6 +665,13 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, "gateway", "enable_key_encryption", True) + # This is a development option. Normally we should abort the gateway on + # errors; this should only be used in case of some catastrophe when we + # want to keep the gateway up + self.abort_on_errors = self.config.getboolean_with_default( + "gateway", + "abort_on_errors", + True) self.ana_map = defaultdict(dict) self.ana_grp_state = {} self.ana_grp_ns_load = {} @@ -649,12 +700,14 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, self.cluster_allocator = get_cluster_allocator(config, self) self.subsys_max_ns = {} self.subsys_serial = {} + self.subsystems_cache = SubsystemsCache() self.host_info = SubsystemHostAuth() self.up_and_running = True self.rebalance = Rebalance(self) self.spdk_version = None self.spdk_qos_timeslice = self.config.getint_with_default("spdk", "qos_timeslice_in_usecs", None) + self.set_gateway_exit_message = set_gateway_exit_message def get_directories_for_key_file(self, key_type: str, subsysnqn: str, create_dir: bool = False) -> []: @@ -867,12 +920,33 @@ def set_cluster_nonce(self, name: str, nonce: str) -> None: self.logger.info(f"Allocated cluster {name=} {nonce=}") self.cluster_nonce[name] = nonce + def abort_gateway_if_needed(self, where: str) -> None: + if self.abort_on_errors: + msg = f"Will abort gateway because of an error in {where}" + self.logger.critical(msg) + self.abort_on_errors = False + self.up_and_running = False + if self.set_gateway_exit_message is not None: + if not self.set_gateway_exit_message(msg): + self.logger.warning(f"Can't get an indication about the gateway aborting. 
" + f"Will continue after an error in {where}") + else: + self.logger.warning(f"No gateway exit function set, will continue after " + f"an error in {where}") + else: + self.logger.warning(f"Gateway abort is disabled, will continue after " + f"an error in {where}") + def _grpc_function_with_lock(self, func, request, context): with self.rpc_lock: rc = func(request, context) if not self.omap_lock.omap_file_disable_unlock: - assert not self.omap_lock.locked(), f"OMAP is still locked when " \ - f"we're out of function {func}" + assert not self.omap_lock.write_locked_by_me(), \ + f"OMAP is still locked when exiting function {func.__name__}()\n" \ + f"locked by: {self.omap_lock.locked_by}, " \ + f"with cookie: {self.omap_lock.lock_cookie}" \ + f"current thread id: {threading.get_native_id()} " \ + f"locked: {self.omap_lock.is_exclusively_locked}" return rc def execute_grpc_function(self, func, request, context): @@ -887,8 +961,16 @@ def execute_grpc_function(self, func, request, context): self.logger.error(errmsg) return pb2.req_status(status=errno.ESHUTDOWN, error_message=errmsg) - return self.omap_lock.execute_omap_locking_function( - self._grpc_function_with_lock, func, request, context) + try: + rc = self.omap_lock.execute_omap_locking_function( + self._grpc_function_with_lock, func, request, context) + except Exception: + self.logger.exception(f"Failure while executing {func.__name__}()") + self.abort_gateway_if_needed(f"{func.__name__}()") + return pb2.req_status(status=errno.ESHUTDOWN, + error_message="Shutting down server") + + return rc def create_bdev(self, anagrp: int, name, uuid, rbd_pool_name, rbd_image_name, block_size, create_image, trash_image, rbd_image_size, context, peer_msg=""): @@ -1167,6 +1249,13 @@ def create_subsystem_safe(self, request, context): nqn=request.subsystem_nqn) if request.max_namespaces: + if request.max_namespaces > GatewayService.MAX_VALUE_FOR_MAX_NAMESPACES_PER_SUBSYSTEM: + errmsg = f"{create_subsystem_error_prefix}: Max namespaces can't be greater " \ + f"than {GatewayService.MAX_VALUE_FOR_MAX_NAMESPACES_PER_SUBSYSTEM}" + self.logger.error(errmsg) + return pb2.subsys_status(status=errno.EINVAL, + error_message=errmsg, + nqn=request.subsystem_nqn) if request.max_namespaces > self.max_namespaces: self.logger.warning(f"The requested max number of namespaces for subsystem " f"{request.subsystem_nqn} ({request.max_namespaces}) is " @@ -1233,6 +1322,14 @@ def create_subsystem_safe(self, request, context): if not request.max_namespaces: request.max_namespaces = self.max_namespaces_per_subsystem + if request.max_namespaces >= Rebalance.INVALID_LOAD_BALANCING_GROUP: + errmsg = f"{create_subsystem_error_prefix}: Maximal number of namespaces " \ + f"({request.max_namespaces}) is too big" + self.logger.error(errmsg) + return pb2.subsys_status(status=errno.E2BIG, + error_message=errmsg, + nqn=request.subsystem_nqn) + if not request.serial_number: random.seed() randser = random.randint(2, 99999999999999) @@ -1474,7 +1571,7 @@ def delete_subsystem(self, request, context=None): # We found a namespace still using this subsystem and --force wasn't used fail with EBUSY if not request.force and len(ns_list) > 0: errmsg = f"{delete_subsystem_error_prefix}: Namespace {ns_list[0]} is still using " \ - f"the subsystem. Either remove it or use the '--force' command line option" + f"the subsystem. 
Either remove it or use the \"--force\" command line option" self.logger.error(errmsg) return pb2.req_status(status=errno.EBUSY, error_message=errmsg) @@ -1520,8 +1617,11 @@ def create_namespace(self, subsystem_nqn, bdev_name, nsid, anagrpid, uuid, auto_visible, rbd_pool, rbd_image_name, trash_image, context): """Adds a namespace to a subsystem.""" - if context: - assert self.omap_lock.locked(), "OMAP is unlocked when calling create_namespace()" + assert context is None or self.omap_lock.write_locked_by_me(), \ + f"OMAP is unlocked when calling create_namespace()\n" \ + f"in thread: {threading.get_native_id()}. Locked by: " \ + f"{self.omap_lock.locked_by}, with cookie: {self.omap_lock.lock_cookie}, " \ + f"locked: {self.omap_lock.is_exclusively_locked}" assert (rbd_pool and rbd_image_name) or ((not rbd_pool) and (not rbd_image_name)), \ "RBD pool and image name should either be both set or both empty" @@ -1741,7 +1841,7 @@ def choose_anagrpid_for_namespace(self, nsid) -> int: # still no namespaces in this ana-group - probably the new GW added self.logger.info(f"New GW created: chosen ana group {ana_grp} for ns {nsid} ") return ana_grp - min_load = 2000 + min_load = Rebalance.INVALID_LOAD_BALANCING_GROUP chosen_ana_group = 0 for ana_grp in self.ana_grp_ns_load: if ana_grp in grps_list: @@ -1970,6 +2070,7 @@ def namespace_change_load_balancing_group_safe(self, request, context): try: state_ns = state[ns_key] ns_entry = json.loads(state_ns) + GatewayService.fill_namespace_missing_fields(ns_entry) except Exception: errmsg = f"{change_lb_group_failure_prefix}: Can't find entry for " \ f"namespace {request.nsid} in {request.subsystem_nqn}" @@ -1979,12 +2080,12 @@ def namespace_change_load_balancing_group_safe(self, request, context): anagrp = ns_entry["anagrpid"] gw_id = self.ceph_utils.get_gw_id_owner_ana_group( self.gateway_pool, self.gateway_group, anagrp) - self.logger.debug(f"ANA group of ns#{request.nsid} - {anagrp} is owned by " - f"gateway {gw_id}, self.name is {self.gateway_name}") - if self.gateway_name != gw_id: - errmsg = f"ANA group of ns#{request.nsid} - {anagrp} is owned by " \ - f"gateway {gw_id} so try this command from it, this gateway " \ - f"name is {self.gateway_name}" + self.logger.debug(f"Load balancing group of ns#{request.nsid} - {anagrp} is " + f"owned by gateway {gw_id}, self.name is {self.gateway_name}") + if gw_id is not None and self.gateway_name != gw_id: + errmsg = f"Load balancing group of ns#{request.nsid} - {anagrp} is " \ + f"owned by gateway {gw_id}, try running the command from " \ + f"there.\nThis gateway name is {self.gateway_name}" self.logger.error(errmsg) return pb2.req_status(status=errno.EEXIST, error_message=errmsg) @@ -2060,6 +2161,21 @@ def namespace_change_load_balancing_group_safe(self, request, context): return pb2.req_status(status=0, error_message=os.strerror(0)) + @staticmethod + def fill_namespace_missing_fields(ns: pb2.namespace_add_req): + try: + ns["trash_image"] + except KeyError: + ns["trash_image"] = False + try: + ns["no_auto_visible"] + except KeyError: + ns["no_auto_visible"] = False + try: + ns["force"] + except KeyError: + ns["force"] = False + def namespace_change_load_balancing_group(self, request, context=None): """Changes a namespace load balancing group.""" return self.execute_grpc_function(self.namespace_change_load_balancing_group_safe, @@ -2150,6 +2266,7 @@ def namespace_change_visibility_safe(self, request, context): try: state_ns = state[ns_key] ns_entry = json.loads(state_ns) + 
GatewayService.fill_namespace_missing_fields(ns_entry) if ns_entry["no_auto_visible"] == (not request.auto_visible): self.logger.warning(f"No change to namespace {request.nsid} in " f"{request.subsystem_nqn} visibility, nothing to do") @@ -2274,6 +2391,7 @@ def namespace_set_rbd_trash_image_safe(self, request, context=None): try: state_ns = state[ns_key] ns_entry = json.loads(state_ns) + GatewayService.fill_namespace_missing_fields(ns_entry) if ns_entry["trash_image"] == request.trash_image: self.logger.warning(f"Namespace {request.nsid} in {request.subsystem_nqn} " f"already has the RBD trash image flag set to the " @@ -2325,9 +2443,11 @@ def remove_namespace_from_state(self, nqn, nsid, context): if not context: return pb2.req_status(status=0, error_message=os.strerror(0)) - # If we got here context is not None, so we must hold the OMAP lock - assert self.omap_lock.locked(), "OMAP is unlocked when calling " \ - "remove_namespace_from_state()" + assert context is None or self.omap_lock.write_locked_by_me(), \ + f"OMAP is unlocked when calling remove_namespace_from_state()\n" \ + f"in thread: {threading.get_native_id()}. Locked by: " \ + f"{self.omap_lock.locked_by}, with cookie: {self.omap_lock.lock_cookie}, " \ + f"locked: {self.omap_lock.is_exclusively_locked}" # Update gateway state try: @@ -2356,8 +2476,12 @@ def remove_namespace_from_state(self, nqn, nsid, context): def remove_namespace(self, subsystem_nqn, nsid, context): """Removes a namespace from a subsystem.""" - if context: - assert self.omap_lock.locked(), "OMAP is unlocked when calling remove_namespace()" + assert context is None or self.omap_lock.write_locked_by_me(), \ + f"OMAP is unlocked when calling remove_namespace()\n" \ + f"in thread: {threading.get_native_id()}. Locked by: " \ + f"{self.omap_lock.locked_by}, with cookie: {self.omap_lock.lock_cookie}, " \ + f"locked: {self.omap_lock.is_exclusively_locked}" + peer_msg = self.get_peer_message(context) namespace_failure_prefix = f"Failure removing namespace {nsid} from {subsystem_nqn}" self.logger.info(f"Received request to remove namespace {nsid} from " @@ -2965,7 +3089,7 @@ def namespace_add_host_safe(self, request, context): failure_prefix = f"Failure adding host {request.host_nqn} to namespace " \ f"{request.nsid} on {request.subsystem_nqn}" self.logger.info(f"Received request to add host {request.host_nqn} to namespace " - f"{request.nsid} on {request.subsystem_nqn}, " + f"{request.nsid} on {request.subsystem_nqn}, force: {request.force}, " f"context: {context}{peer_msg}") if not request.nsid: @@ -3036,6 +3160,22 @@ def namespace_add_host_safe(self, request, context): self.logger.error(errmsg) return pb2.req_status(status=errno.E2BIG, error_message=errmsg) + host_allowed = self.host_info.is_any_host_allowed(request.subsystem_nqn) + if not host_allowed: + host_allowed = self.host_info.does_host_exist(request.subsystem_nqn, request.host_nqn) + + if not host_allowed: + if request.force: + self.logger.info(f"Host {request.host_nqn} is not allowed to access " + f"subsystem {request.subsystem_nqn} but it will be added " + f"to namespace {request.nsid} as the \"--force\" parameter " + f"was used") + else: + errmsg = f"{failure_prefix}: Host is not allowed to access the subsystem, " \ + f"use the \"--force\" parameter to add the host anyway" + self.logger.error(errmsg) + return pb2.req_status(status=errno.EACCES, error_message=errmsg) + omap_lock = self.omap_lock.get_omap_lock_to_use(context) with omap_lock: ret = rpc_nvmf.nvmf_ns_visible( @@ -3594,8 +3734,12 @@ def 
remove_host_from_state(self, subsystem_nqn, host_nqn, context): if not context: return pb2.req_status(status=0, error_message=os.strerror(0)) - if context: - assert self.omap_lock.locked(), "OMAP is unlocked when calling remove_host_from_state()" + assert context is None or self.omap_lock.write_locked_by_me(), \ + f"OMAP is unlocked when calling remove_host_from_state()\n" \ + f"in thread: {threading.get_native_id()}. Locked by: " \ + f"{self.omap_lock.locked_by}, with cookie: {self.omap_lock.lock_cookie}, " \ + f"locked: {self.omap_lock.is_exclusively_locked}" + # Update gateway state try: self.gateway_state.remove_host(subsystem_nqn, host_nqn) @@ -3919,36 +4063,57 @@ def list_hosts_safe(self, request, context): peer_msg = self.get_peer_message(context) self.logger.info(f"Received request to list hosts for " f"{request.subsystem}, context: {context}{peer_msg}") - try: - ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_client, nqn=request.subsystem) - self.logger.debug(f"list_hosts: {ret}") - except Exception as ex: - errmsg = "Failure listing hosts, can't get subsystems" - self.logger.exception(errmsg) - errmsg = f"{errmsg}:\n{ex}" - resp = self.parse_json_exeption(ex) - status = errno.EINVAL - if resp: - status = resp["code"] - errmsg = f"Failure listing hosts, can't get subsystems: {resp['message']}" - return pb2.hosts_info(status=status, error_message=errmsg, hosts=[]) + ret = None + if not context: + ret = self.subsystems_cache.get_one_subsystem(request.subsystem) + self.logger.debug(f"list_hosts subsystem (cache): {ret}") + if not ret: + try: + ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_client, nqn=request.subsystem) + self.logger.debug(f"list_hosts: {ret}") + except Exception as ex: + errmsg = "Failure listing hosts, can't get subsystems" + self.logger.exception(errmsg) + errmsg = f"{errmsg}:\n{ex}" + resp = self.parse_json_exeption(ex) + status = errno.EINVAL + if resp: + status = resp["code"] + errmsg = f"Failure listing hosts, can't get subsystems: {resp['message']}" + return pb2.hosts_info(status=status, error_message=errmsg, hosts=[]) + + parsed_ret = [] + for s in ret: + subsys = pb2.subsystem() + try: + json_format.Parse(json.dumps(s), subsys, ignore_unknown_fields=True) + except Exception: + self.logger.exception(f"Failure listing hosts, can't parse subsystem {s}") + return pb2.hosts_info(status=errno.EINVAL, + error_message="Failure listing hosts, " + "can't parse subsystem", + hosts=[]) + parsed_ret.append(subsys) + ret = parsed_ret + if not ret: + ret = [] hosts = [] allow_any_host = False for s in ret: try: - if s["nqn"] != request.subsystem: - self.logger.warning(f'Got subsystem {s["nqn"]} instead of ' + if s.nqn != request.subsystem: + self.logger.warning(f'Got subsystem {s.nqn} instead of ' f'{request.subsystem}, ignore') continue try: - allow_any_host = s["allow_any_host"] - host_nqns = s["hosts"] + allow_any_host = s.allow_any_host + host_nqns = s.hosts except Exception: host_nqns = [] pass for h in host_nqns: - host_nqn = h["nqn"] + host_nqn = h.nqn psk = self.host_info.is_psk_host(request.subsystem, host_nqn) dhchap = self.host_info.is_dhchap_host(request.subsystem, host_nqn) one_host = pb2.host(nqn=host_nqn, use_psk=psk, use_dhchap=dhchap) @@ -4008,36 +4173,60 @@ def list_connections_safe(self, request, context): errmsg = f"Failure listing connections, can't get controllers: {resp['message']}" return pb2.connections_info(status=status, error_message=errmsg, connections=[]) - try: - subsys_ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_client, 
nqn=request.subsystem) - self.logger.debug(f"list_connections subsystems: {subsys_ret}") - except Exception as ex: - errmsg = "Failure listing connections, can't get subsystems" - self.logger.exception(errmsg) - errmsg = f"{errmsg}:\n{ex}" - resp = self.parse_json_exeption(ex) - status = errno.EINVAL - if resp: - status = resp["code"] - errmsg = f"Failure listing connections, can't get subsystems: {resp['message']}" - return pb2.connections_info(status=status, error_message=errmsg, connections=[]) + subsys_ret = None + if not context: + subsys_ret = self.subsystems_cache.get_one_subsystem(request.subsystem) + self.logger.debug(f"list_connections subsystems (cache): {subsys_ret}") + if not subsys_ret: + try: + subsys_ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_client, + nqn=request.subsystem) + self.logger.debug(f"list_connections subsystems: {subsys_ret}") + except Exception as ex: + errmsg = "Failure listing connections, can't get subsystems" + self.logger.exception(errmsg) + errmsg = f"{errmsg}:\n{ex}" + resp = self.parse_json_exeption(ex) + status = errno.EINVAL + if resp: + status = resp["code"] + errmsg = f"Failure listing connections, can't get subsystems: {resp['message']}" + return pb2.connections_info(status=status, error_message=errmsg, connections=[]) + + parsed_ret = [] + for s in subsys_ret: + subsys = pb2.subsystem() + try: + json_format.Parse(json.dumps(s), subsys, ignore_unknown_fields=True) + except Exception: + self.logger.exception(f"Failure listing connections, " + f"can't parse subsystem {s}") + return pb2.connections_info(status=errno.EINVAL, + error_message="Failure listing connections, " + "can't parse subsystem", + connections=[]) + parsed_ret.append(subsys) + subsys_ret = parsed_ret + + if not subsys_ret: + subsys_ret = [] connections = [] host_nqns = [] for s in subsys_ret: try: - if s["nqn"] != request.subsystem: - self.logger.warning(f'Got subsystem {s["nqn"]} instead of ' + if s.nqn != request.subsystem: + self.logger.warning(f'Got subsystem {s.nqn} instead of ' f'{request.subsystem}, ignore') continue try: - subsys_hosts = s["hosts"] + subsys_hosts = s.hosts except Exception: subsys_hosts = [] pass for h in subsys_hosts: try: - host_nqns.append(h["nqn"]) + host_nqns.append(h.nqn) except Exception: pass break @@ -4163,6 +4352,12 @@ def create_listener_safe(self, request, context): self.logger.error(errmsg) return pb2.req_status(status=errno.EINVAL, error_message=errmsg) + # If this is not set the subsystem was not created yet + if request.nqn not in self.subsys_serial: + errmsg = f"{create_listener_error_prefix}: can't find subsystem {request.nqn}" + self.logger.error(errmsg) + return pb2.req_status(status=errno.ENOENT, error_message=errmsg) + if not GatewayState.is_key_element_valid(request.host_name): errmsg = f"{create_listener_error_prefix}: Host name " \ f"\"{request.host_name}\" contains invalid characters" @@ -4212,7 +4407,7 @@ def create_listener_safe(self, request, context): return pb2.req_status(status=errno.EEXIST, error_message=errmsg) if self.verify_listener_ip: - nics = NICS(True) + nics = NICS(self.logger, True) if not nics.verify_ip_address(traddr, adrfam): for dev in nics.adapters.values(): self.logger.debug(f"NIC: {dev}") @@ -4332,9 +4527,11 @@ def remove_listener_from_state(self, nqn, host_name, traddr, port, context): if not context: return pb2.req_status(status=0, error_message=os.strerror(0)) - if context: - assert self.omap_lock.locked(), "OMAP is unlocked when calling " \ - "remove_listener_from_state()" + assert context is None or 
self.omap_lock.write_locked_by_me(), \ + f"OMAP is unlocked when calling remove_listener_from_state()\n" \ + f"in thread: {threading.get_native_id()}. Locked by: " \ + f"{self.omap_lock.locked_by}, with cookie: {self.omap_lock.lock_cookie}, " \ + f"locked: {self.omap_lock.is_exclusively_locked}" host_name = host_name.strip() listener_hosts = [] @@ -4434,7 +4631,7 @@ def delete_listener_safe(self, request, context): errmsg = f"{delete_listener_error_prefix}: There are active connections for " \ f"{esc_traddr}:{request.trsvcid}. Deleting the listener terminates " \ f"active connections. You can continue to delete the listener by " \ - f"adding the `--force` parameter." + f"adding the \"--force\" parameter." self.logger.error(errmsg) return pb2.req_status(status=errno.ENOTEMPTY, error_message=errmsg) @@ -4475,7 +4672,7 @@ def delete_listener_safe(self, request, context): else: errmsg = f"{delete_listener_error_prefix}: Gateway's host name must " \ f"match current host ({self.host_name}). You can continue to " \ - f"delete the listener by adding the `--force` parameter." + f"delete the listener by adding the \"--force\" parameter." self.logger.error(errmsg) return pb2.req_status(status=errno.ENOENT, error_message=errmsg) except Exception as ex: @@ -4645,6 +4842,8 @@ def show_gateway_listeners_info(self, request, context=None): def list_subsystems_safe(self, request, context): """List subsystems.""" + assert self.spdk_rpc_subsystems_lock.locked(), "Subsystems RPC is unlocked when calling " \ + "list_subsystems_safe()" peer_msg = self.get_peer_message(context) log_level = logging.INFO if context else logging.DEBUG if request.subsystem_nqn: @@ -4662,11 +4861,13 @@ def list_subsystems_safe(self, request, context): f"{context}{peer_msg}") subsystems = [] + cache_subsystems = [] try: if request.subsystem_nqn: - ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_client, nqn=request.subsystem_nqn) + ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_subsystems_client, + nqn=request.subsystem_nqn) else: - ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_client) + ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_subsystems_client) self.logger.debug(f"list_subsystems: {ret}") except Exception as ex: errmsg = "Failure listing subsystems" @@ -4679,11 +4880,10 @@ def list_subsystems_safe(self, request, context): errmsg = f"Failure listing subsystems: {resp['message']}" return pb2.subsystems_info_cli(status=status, error_message=errmsg, subsystems=[]) + if not ret: + ret = [] for s in ret: try: - if request.serial_number: - if s["serial_number"] != request.serial_number: - continue if s["subtype"] == "NVMe": ns_count = len(s["namespaces"]) if not ns_count: @@ -4693,6 +4893,15 @@ def list_subsystems_safe(self, request, context): s["has_dhchap_key"] = self.host_info.does_subsystem_have_dhchap_key(s["nqn"]) s["created_without_key"] = \ self.host_info.was_subsystem_created_without_key(s["nqn"]) + for n in s["namespaces"]: + bdev = n["bdev_name"] + with self.shared_state_lock: + nonce = self.cluster_nonce[self.bdev_cluster[bdev]] + n["nonce"] = nonce + find_ret = self.subsystem_nsid_bdev_and_uuid.find_namespace( + s["nqn"], n["nsid"]) + n["auto_visible"] = find_ret.auto_visible + n["hosts"] = find_ret.host_list else: s["namespace_count"] = 0 s["enable_ha"] = False @@ -4700,59 +4909,29 @@ def list_subsystems_safe(self, request, context): # Parse the JSON dictionary into the protobuf message subsystem = pb2.subsystem_cli() json_format.Parse(json.dumps(s), subsystem, ignore_unknown_fields=True) - 
subsystems.append(subsystem) + if not request.serial_number or s["serial_number"] == request.serial_number: + subsystems.append(subsystem) + if not request.subsystem_nqn: + cache_subsystem = pb2.subsystem() + json_format.Parse(json.dumps(s), cache_subsystem, ignore_unknown_fields=True) + cache_subsystems.append(cache_subsystem) except Exception: self.logger.exception(f"{s=} parse error") pass + # Only set cache if we've listed all subsystems + if not request.subsystem_nqn: + self.subsystems_cache.set_subsystems(pb2.subsystems_info(subsystems=cache_subsystems)) + return pb2.subsystems_info_cli(status=0, error_message=os.strerror(0), subsystems=subsystems) - def get_subsystems_safe(self, request, context): - """Gets subsystems.""" - - peer_msg = self.get_peer_message(context) - self.logger.debug(f"Received request to get subsystems, context: {context}{peer_msg}") - subsystems = [] - try: - ret = rpc_nvmf.nvmf_get_subsystems(self.spdk_rpc_subsystems_client) - except Exception as ex: - self.logger.exception("get_subsystems failed") - if context: - context.set_code(grpc.StatusCode.INTERNAL) - context.set_details(f"{ex}") - return pb2.subsystems_info() - - for s in ret: - try: - s["has_dhchap_key"] = self.host_info.does_subsystem_have_dhchap_key(s["nqn"]) - ns_key = "namespaces" - if ns_key in s: - for n in s[ns_key]: - bdev = n["bdev_name"] - with self.shared_state_lock: - nonce = self.cluster_nonce[self.bdev_cluster[bdev]] - n["nonce"] = nonce - find_ret = self.subsystem_nsid_bdev_and_uuid.find_namespace(s["nqn"], - n["nsid"]) - n["auto_visible"] = find_ret.auto_visible - n["hosts"] = find_ret.host_list - # Parse the JSON dictionary into the protobuf message - subsystem = pb2.subsystem() - json_format.Parse(json.dumps(s), subsystem, ignore_unknown_fields=True) - subsystems.append(subsystem) - except Exception: - self.logger.exception(f"{s=} parse error") - pass - - return pb2.subsystems_info(subsystems=subsystems) - def get_subsystems(self, request, context): - with self.spdk_rpc_subsystems_lock: - return self.get_subsystems_safe(request, context) + return self.subsystems_cache.get_subsystems() def list_subsystems(self, request, context=None): - return self.execute_grpc_function(self.list_subsystems_safe, request, context) + with self.spdk_rpc_subsystems_lock: + return self.list_subsystems_safe(request, context) def change_subsystem_key_safe(self, request, context): """Change subsystem key.""" diff --git a/control/prometheus.py b/control/prometheus.py index b93bb650f3f..fb06d0dcfe7 100644 --- a/control/prometheus.py +++ b/control/prometheus.py @@ -210,11 +210,17 @@ def _get_connection_map(self, subsystem_list): def _get_data(self): """Gather data from the SPDK""" self.bdev_info = self._get_bdev_info() + logger.debug("Done with _get_bdev_info()") self.bdev_io_stats = self._get_bdev_io_stats() + logger.debug("Done with _get_bdev_io_stats()") self.spdk_thread_stats = self._get_spdk_thread_stats() + logger.debug("Done with _get_spdk_thread_stats()") self.subsystems = self._get_subsystems() + logger.debug("Done with _get_subsystems()") self.subsystems_cli = self._list_subsystems() + logger.debug("Done with _list_subsystems()") self.connections = self._get_connection_map(self.subsystems) + logger.debug("Done with _get_connection_map()") @ttl def collect(self): @@ -230,10 +236,11 @@ def collect(self): elapsed = sum(self.method_timings.values()) if elapsed > self.interval: - logger.error(f"Stats refresh time > interval time of {self.interval} secs") + logger.error(f"Stats refresh time {elapsed:.3f} > 
interval time of " + f"{self.interval} secs") elif elapsed > self.interval * COLLECTION_ELAPSED_WARNING: logger.warning(f"Stats refresh of {elapsed:.2f}s is close to exceeding " - f"the interval {self.interval}s") + f"the interval {self.interval} secs") else: logger.debug(f"Stats refresh completed in {elapsed:.3f} secs.") diff --git a/control/proto/gateway.proto b/control/proto/gateway.proto index e0b6e427c6f..410b0688f22 100644 --- a/control/proto/gateway.proto +++ b/control/proto/gateway.proto @@ -202,6 +202,7 @@ message namespace_add_host_req { string subsystem_nqn = 1; uint32 nsid = 2; string host_nqn = 3; + optional bool force = 4; } message namespace_delete_host_req { diff --git a/control/rebalance.py b/control/rebalance.py index c1d6f9fea4b..e5409876731 100755 --- a/control/rebalance.py +++ b/control/rebalance.py @@ -11,13 +11,13 @@ import time from .proto import gateway_pb2 as pb2 -MIN_LOAD = 2000 - class Rebalance: """Miscellaneous functions which do rebalance of ANA groups """ + INVALID_LOAD_BALANCING_GROUP = 18446744073709551616 # should be bigger than any valid NSID + def __init__(self, gateway_service): self.logger = gateway_service.logger self.gw_srv = gateway_service @@ -40,7 +40,11 @@ def __init__(self, gateway_service): def auto_rebalance_task(self, death_event): """Periodically calls for auto rebalance.""" + self.logger.debug(f"Rebalance thread id is {self.auto_rebalance.native_id}") while (self.rebalance_period_sec > 0): + while self.gw_srv.gateway_state.update_is_active_lock.locked(): + time.sleep(0.5) # wait until update is over + for i in range(self.rebalance_max_ns_to_change_lb_grp): try: rc = self.gw_srv.execute_grpc_function(self.rebalance_logic, None, "context") @@ -56,7 +60,7 @@ def auto_rebalance_task(self, death_event): time.sleep(self.rebalance_period_sec) def find_min_loaded_group(self, grp_list) -> int: - min_load = MIN_LOAD + min_load = Rebalance.INVALID_LOAD_BALANCING_GROUP chosen_ana_group = 0 chosen_nqn = "null" for ana_grp in self.gw_srv.ana_grp_ns_load: @@ -66,7 +70,7 @@ def find_min_loaded_group(self, grp_list) -> int: if self.gw_srv.ana_grp_ns_load[ana_grp] <= min_load: min_load = self.gw_srv.ana_grp_ns_load[ana_grp] chosen_ana_group = ana_grp - min_load = MIN_LOAD + min_load = Rebalance.INVALID_LOAD_BALANCING_GROUP self.logger.debug(f"chosen ana-group {chosen_ana_group}") if chosen_ana_group != 0: for nqn in self.gw_srv.ana_grp_subs_load[chosen_ana_group]: @@ -78,23 +82,39 @@ def find_min_loaded_group(self, grp_list) -> int: return chosen_ana_group, chosen_nqn def find_min_loaded_group_in_subsys(self, nqn, grp_list) -> int: - min_load = MIN_LOAD + min_load = Rebalance.INVALID_LOAD_BALANCING_GROUP chosen_ana_group = 0 + min_groups = set() for ana_grp in grp_list: if self.gw_srv.ana_grp_ns_load[ana_grp] == 0: self.gw_srv.ana_grp_subs_load[ana_grp][nqn] = 0 + self.logger.debug(f"chosen ana_grp {ana_grp}, min load = {0}") return 0, ana_grp for ana_grp in self.gw_srv.ana_grp_subs_load: if ana_grp in grp_list: if nqn in self.gw_srv.ana_grp_subs_load[ana_grp]: - if self.gw_srv.ana_grp_subs_load[ana_grp][nqn] <= min_load: + if self.gw_srv.ana_grp_subs_load[ana_grp][nqn] < min_load: min_load = self.gw_srv.ana_grp_subs_load[ana_grp][nqn] - chosen_ana_group = ana_grp + min_groups = {ana_grp} + elif self.gw_srv.ana_grp_subs_load[ana_grp][nqn] == min_load: + min_groups.add(ana_grp) else: # still no load on this ana and subs - chosen_ana_group = ana_grp - self.gw_srv.ana_grp_subs_load[chosen_ana_group][nqn] = 0 - min_load = 0 - break + 
self.gw_srv.ana_grp_subs_load[ana_grp][nqn] = 0
+ if self.gw_srv.ana_grp_subs_load[ana_grp][nqn] < min_load:
+ min_load = 0
+ min_groups = {ana_grp}
+ elif self.gw_srv.ana_grp_subs_load[ana_grp][nqn] == min_load:
+ min_groups.add(ana_grp)
+ min_load = Rebalance.INVALID_LOAD_BALANCING_GROUP
+ for ana_grp in min_groups:
+ # choose the minimum loaded ana group from the ana groups in the min_groups set
+ self.logger.debug(f"pass min_groups set: ana_grp {ana_grp} "
+ f"load {self.gw_srv.ana_grp_ns_load[ana_grp]}")
+ # find the minimum loaded entry in self.gw_srv.ana_grp_ns_load
+ if self.gw_srv.ana_grp_ns_load[ana_grp] < min_load:
+ min_load = self.gw_srv.ana_grp_ns_load[ana_grp]
+ self.logger.debug(f"chosen ana_grp {ana_grp}, min load = {min_load}")
+ chosen_ana_group = ana_grp
 return min_load, chosen_ana_group

 # 1. Not allowed to perform regular rebalance when scale_down rebalance is ongoing
@@ -104,12 +124,17 @@ def find_min_loaded_group_in_subsys(self, nqn, grp_list) -> int:
 # index of ANA group that is currently responsible for rebalance
 def rebalance_logic(self, request, context) -> int:
 now = time.time()
+ rebalance_attr = ()
+ grps_list = self.ceph_utils.get_number_created_gateways(self.gw_srv.gateway_pool,
+ self.gw_srv.gateway_group, False)
 worker_ana_group = self.ceph_utils.get_rebalance_ana_group()
 self.logger.debug(f"Called rebalance logic: current rebalancing ana "
 f"group {worker_ana_group}")
+ if worker_ana_group == 0:
+ self.logger.info(f"Auto rebalance is not supported - index {worker_ana_group}")
+ return 1
 ongoing_scale_down_rebalance = False
- grps_list = self.ceph_utils.get_number_created_gateways(self.gw_srv.gateway_pool,
- self.gw_srv.gateway_group, False)
+ invalid_ana_group = 0
 if not self.ceph_utils.is_rebalance_supported():
 self.logger.info("Auto rebalance is not supported with the curent ceph version")
 return 1
@@ -122,7 +147,7 @@ def rebalance_logic(self, request, context) -> int:
 self.logger.info(f"Scale-down rebalance is ongoing for ANA group {ana_grp} "
 f"current load {self.gw_srv.ana_grp_ns_load[ana_grp]}")
 self.last_scale_down_ts = now
- break
+ invalid_ana_group = ana_grp
 num_active_ana_groups = len(grps_list)
 for ana_grp in self.gw_srv.ana_grp_state:
 if self.gw_srv.ana_grp_state[ana_grp] == pb2.ana_state.OPTIMIZED:
@@ -171,12 +196,13 @@ def rebalance_logic(self, request, context) -> int:
 f"nqn {nqn} ")
 min_load, min_ana_grp = \
 self.find_min_loaded_group_in_subsys(nqn, grps_list)
- le_target = \
- (self.gw_srv.ana_grp_subs_load[min_ana_grp][nqn] + 1) <= \
- target_subs_per_ana
- load_eq = (self.gw_srv.ana_grp_subs_load[min_ana_grp][nqn] + 1) == \
- (self.gw_srv.ana_grp_subs_load[ana_grp][nqn] - 1)
- if le_target or load_eq:
+
+ my_eq_more = (self.gw_srv.ana_grp_subs_load[ana_grp][nqn] - 1) >= \
+ (self.gw_srv.ana_grp_subs_load[min_ana_grp][nqn] + 1)
+
+ worth = (self.gw_srv.ana_grp_ns_load[ana_grp] - # noqa: W504
+ self.gw_srv.ana_grp_ns_load[min_ana_grp] > 1) # noqa: W504
+ if my_eq_more:
 self.logger.info(f"Start rebalance (regular) in subsystem "
 f"{nqn}, dest ana {min_ana_grp}, dest ana "
 f"load per subs {min_load}")
@@ -184,21 +210,36 @@ def rebalance_logic(self, request, context) -> int:
 self.ns_rebalance(context, ana_grp, min_ana_grp, 1, nqn)
 return 0
 else:
+ # add to tuple: (ana, min-ana, nqn, worth)
+ if worth:
+ rebalance_attr = (ana_grp, min_ana_grp, nqn, worth)
 self.logger.debug(f"Found min loaded subsystem {nqn}, ana "
 f"{min_ana_grp}, load {min_load} does not "
 f"fit rebalance criteria!")
 continue
 if ongoing_scale_down_rebalance and (num_active_ana_groups ==
self.ceph_utils.num_gws): # this GW feels scale_down condition on ana_grp but no GW in Deleting - # state in the current mon.map . Experimental code - just for logs - self.logger.info(f"Seems like scale-down deadlock on group {ana_grp}") + # state in the current mon.map. So need to change LB group for all NS + # related to the invalid group - group that was deleted by GW monitor + self.logger.info(f"Detected deleted LB group {invalid_ana_group}") if (self.gw_srv.ana_grp_state[worker_ana_group]) == pb2.ana_state.OPTIMIZED: min_ana_grp, chosen_nqn = self.find_min_loaded_group(grps_list) - if chosen_nqn != "null": + if chosen_nqn != "null" and invalid_ana_group != 0: self.logger.info(f"Start rebalance (deadlock resolving) dest. ana group" f" {min_ana_grp}, subsystem {chosen_nqn}") - # self.ns_rebalance(context, ana_grp, min_ana_grp, 1, "0") + self.ns_rebalance(context, invalid_ana_group, min_ana_grp, 1, "0") return 0 + else: + self.logger.info(f"rebalance (deadlock resolving) is not allowed " + f" invalid group {invalid_ana_group}," + f" subsystem {chosen_nqn}") + # if tuple is not empty + if rebalance_attr: + self.logger.info( + f"Start rebalance (fixing regular) in subsystem " + f"ana {rebalance_attr[0]}, dest ana {rebalance_attr[1]} nqn {rebalance_attr[2]}") + self.ns_rebalance(context, rebalance_attr[0], rebalance_attr[1], 1, rebalance_attr[2]) + return 0 return 1 def ns_rebalance(self, context, ana_id, dest_ana_id, num, subs_nqn) -> int: @@ -215,6 +256,10 @@ def ns_rebalance(self, context, ana_id, dest_ana_id, num, subs_nqn) -> int: f"{subsys}, anagrpid: {ana_id}") change_lb_group_req = pb2.namespace_change_load_balancing_group_req( subsystem_nqn=subsys, nsid=nsid, anagrpid=dest_ana_id, auto_lb_logic=True) + if not self.gw_srv.up_and_running: + self.logger.warning("SPDK is not up and running!") + return 0 + ret = self.gw_srv.namespace_change_load_balancing_group_safe(change_lb_group_req, context) self.logger.debug(f"ret namespace_change_load_balancing_group {ret}") diff --git a/control/server.py b/control/server.py index ce908a31b7a..3ec1b4d43ad 100644 --- a/control/server.py +++ b/control/server.py @@ -23,6 +23,7 @@ import spdk.rpc.client as rpc_client import spdk.rpc.nvmf as rpc_nvmf import spdk.rpc.iobuf as rpc_iobuf +import spdk.rpc.sock as rpc_sock from .proto import gateway_pb2 as pb2 from .proto import gateway_pb2_grpc as pb2_grpc @@ -99,6 +100,9 @@ class GatewayServer: discovery_pid: Subprocess running Ceph nvmeof discovery service """ + MAX_TIME_TO_WAIT_FOR_GATEWAY_EXIT = 30 + SPDK_PING_INTERVAL_DEFAULT = 2.0 + def __init__(self, config: GatewayConfig): self.config = config self.gw_logger_object = GatewayLogger(self.config) @@ -121,6 +125,8 @@ def __init__(self, config: GatewayConfig): self.omap_state = None self.omap_lock = None self.crypto = None + self.gateway_state = None + self.exiting = False enc_key = None enc_key_file = self.config.get_with_default("gateway", "encryption_key", "") if enc_key_file: @@ -140,6 +146,9 @@ def __init__(self, config: GatewayConfig): self.name = self.config.get("gateway", "name") if not self.name: self.name = socket.gethostname() + self.system_exit_message = None + self.system_exit_message_lock = threading.Lock() + self.gateway_exit_started = threading.Event() self.logger.info(f"Starting gateway {self.name}") def __enter__(self): @@ -147,16 +156,37 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): """Cleans up SPDK and server instances.""" + + gw_logger = self.gw_logger_object + logger = None + if gw_logger: + logger = 
gw_logger.logger + + normalExit = False + if exc_type is None and exc_value is None: + normalExit = True + elif isinstance(exc_value, SystemExit) and isinstance(exc_value.code, int): + normalExit = exc_value.code == 0 + if self.gateway_rpc: self.gateway_rpc.up_and_running = False - if exc_type is not None: - self.logger.exception("GatewayServer exception occurred:\n{traceback}\n") - else: - self.logger.info("GatewayServer is terminating gracefully...") - + if self.gateway_state: + self.gateway_state.up_and_running = False + if self.gateway_state.omap: + self.gateway_state.omap.up_and_running = False gw_name = self.name - gw_logger = self.gw_logger_object - logger = gw_logger.logger + if self.exiting: + if logger: + logger.debug("Already exiting, do nothing") + return normalExit + self.exiting = True + + if logger: + if normalExit: + logger.info("GatewayServer is terminating gracefully...") + else: + logger.exception("GatewayServer exception occurred") + signal.signal(signal.SIGCHLD, signal.SIG_IGN) if self.monitor_client_process: self._stop_monitor_client() @@ -202,7 +232,11 @@ def __exit__(self, exc_type, exc_value, traceback): if logger: logger.info("Exiting the gateway process.") - gw_logger.compress_final_log_file(gw_name) + + if gw_logger and gw_name: + gw_logger.compress_final_log_file(gw_name) + + return normalExit def set_group_id(self, id: int): self.logger.info(f"Gateway {self.name} group {id=}") @@ -241,10 +275,10 @@ def serve(self): self.logger.info(f"Starting serve, monitor client version: " f"{self._monitor_client_version()}") - omap_state = OmapGatewayState(self.config, f"gateway-{self.name}") + omap_state = OmapGatewayState(self.config, self.set_gateway_exit_message, + f"gateway-{self.name}") self.omap_state = omap_state local_state = LocalGatewayState() - omap_state.check_for_old_format_omap_files() # install SIGCHLD handler signal.signal(signal.SIGCHLD, sigchld_handler) @@ -258,24 +292,25 @@ def serve(self): self.ceph_utils = CephUtils(self.config) # Start SPDK - self._start_spdk(omap_state) + self._start_spdk() # Start discovery service self._start_discovery_service() # Register service implementation with server - gateway_state = GatewayStateHandler(self.config, local_state, omap_state, - self.gateway_rpc_caller, self.crypto, - f"gateway-{self.name}") - self.omap_lock = OmapLock(omap_state, gateway_state, self.rpc_lock) - self.gateway_rpc = GatewayService(self.config, gateway_state, self.rpc_lock, + self.gateway_state = GatewayStateHandler(self.config, local_state, omap_state, + self.gateway_rpc_caller, self.crypto, + f"gateway-{self.name}") + self.omap_lock = OmapLock(self.gateway_state, self.rpc_lock) + self.gateway_rpc = GatewayService(self.config, self.gateway_state, self.rpc_lock, self.omap_lock, self.group_id, self.spdk_rpc_client, - self.spdk_rpc_subsystems_client, self.ceph_utils) + self.spdk_rpc_subsystems_client, self.ceph_utils, + self.set_gateway_exit_message) self.server = self._grpc_server(self._gateway_address()) pb2_grpc.add_GatewayServicer_to_server(self.gateway_rpc, self.server) # Check for existing NVMeoF target state - gateway_state.start_update() + self.gateway_state.start_update() # Start server self.server.start() @@ -400,9 +435,16 @@ def _start_discovery_service(self): self.discovery_pid = os.fork() if self.discovery_pid == 0: self.logger.info("Starting ceph nvmeof discovery service") - with DiscoveryService(self.config) as discovery: - discovery.start_service() - os._exit(0) + # disable inherited from gateway signal handlers + 
signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+ signal.signal(signal.SIGTERM, signal.SIG_DFL)
+ try:
+ with DiscoveryService(self.config) as discovery:
+ discovery.start_service()
+ except Exception:
+ self.logger.exception("Exception occurred while running the discovery service")
+ finally:
+ os._exit(0)
 else:
 self.logger.info(f"Discovery service process id: {self.discovery_pid}")
@@ -496,12 +538,11 @@ def handle_process_output_file(self, log_file_dir, prefix):

 return log_file_path

- def _start_spdk(self, omap_state):
+ def _start_spdk(self):
 """Starts SPDK process."""

 # Start target
 self.logger.debug(f"Configuring server {self.name}")
- waiting_for_rpc = False
 spdk_tgt_path = self.config.get("spdk", "tgt_path")
 self.logger.info(f"SPDK Target Path: {spdk_tgt_path}")
 sockdir = self.config.get_with_default("spdk", "rpc_socket_dir", "/var/tmp/")
@@ -523,15 +564,12 @@ def _start_spdk(self, omap_state):
 self.logger.info(f"SPDK Socket: {self.spdk_rpc_socket_path}")
 spdk_tgt_cmd_extra_args = self.config.get_with_default(
 "spdk", "tgt_cmd_extra_args", "")
- cmd = [spdk_tgt_path, "-u", "-r", self.spdk_rpc_socket_path]
+ cmd = [spdk_tgt_path, "--wait-for-rpc", "-u", "-r", self.spdk_rpc_socket_path]
 iobuf_options = self.config.get_with_default("spdk", "iobuf_options", "")
 max_subsystems = self.config.getint_with_default("gateway",
 "max_subsystems",
 GatewayService.MAX_SUBSYSTEMS_DEFAULT)
- if iobuf_options or max_subsystems > 0:
- waiting_for_rpc = True
- cmd += ["--wait-for-rpc"]

 # Add extra args from the conf file
 if spdk_tgt_cmd_extra_args:
@@ -611,16 +649,12 @@ def _start_spdk(self, omap_state):
 if iobuf_options:
 self._initialize_iobuf_options(iobuf_options)

- if waiting_for_rpc:
- self._initialize_framework()
+ # Set SSL tickets for ssl sock implementation
+ self._set_num_ssl_tickets(0)
+
+ # Notice that some SPDK calls can't be made after framework init
+ self._initialize_framework()

- self.spdk_rpc_ping_client = rpc_client.JSONRPCClient(
- self.spdk_rpc_socket_path,
- None,
- timeout,
- log_level=protocol_log_level,
- conn_retries=conn_retries,
- )
 self.spdk_rpc_subsystems_client = rpc_client.JSONRPCClient(
 self.spdk_rpc_socket_path,
 None,
@@ -711,18 +745,54 @@ def _stop_spdk(self):
 self.logger.exception(f"An error occurred while removing RPC "
 f"socket {self.spdk_rpc_socket_path}")

+ def _terminate_discovery(self, pid):
+ def is_running(pid):
+ try:
+ os.kill(pid, 0)
+ return True
+ except ProcessLookupError:
+ return False
+ except PermissionError:
+ self.logger.exception(f"Permission denied when checking status of discovery {pid}")
+ return True
+
+ def wait_for_exit(pid):
+ WAIT_INTERVAL_SEC = 0.1
+ MAX_WAIT_ATTEMPTS = 10
+ for _ in range(MAX_WAIT_ATTEMPTS):
+ if not is_running(pid):
+ return True
+ time.sleep(WAIT_INTERVAL_SEC)
+ return False
+
+ try:
+ # discovery service selector loop should exit
+ # due to KeyboardInterrupt exception on SIGINT signal
+ signals = [signal.SIGINT, signal.SIGTERM, signal.SIGKILL]
+ for sig in signals:
+ self.logger.info(f"Sending signal {sig.name} to Discovery service process {pid}")
+ os.kill(pid, sig)
+ if wait_for_exit(pid):
+ return True
+ self.logger.warning(
+ f"Discovery service process {pid} did not exit after all signals."
+ ) + return False + + except ProcessLookupError: + self.logger.info(f"Discovery service process {pid} already exited.") + return True + except Exception: + self.logger.exception(f"Error terminating discovery service process {pid}") + return False + def _stop_discovery(self): """Stops Discovery service process.""" assert self.discovery_pid is not None # should be verified by the caller self.logger.info("Terminating discovery service...") - # discovery service selector loop should exit due to KeyboardInterrupt exception - try: - os.kill(self.discovery_pid, signal.SIGINT) - os.waitpid(self.discovery_pid, 0) - except (ChildProcessError, ProcessLookupError): - pass # ignore - self.logger.info("Discovery service terminated") + if self._terminate_discovery(self.discovery_pid): + self.logger.info("Discovery service terminated") self.discovery_pid = None @@ -753,6 +823,17 @@ def _initialize_iobuf_options(self, options): self.logger.exception("IObuf set options returned with error") pass + def _set_num_ssl_tickets(self, tickets_number=0): + """Set SSL tickets number for ssl socket implementation.""" + + try: + rpc_sock.sock_impl_set_options(self.spdk_rpc_client, + impl_name="ssl", + num_ssl_tickets=tickets_number) + except Exception: + self.logger.exception("sock_impl_set_options returned with error") + pass + def _initialize_framework(self): """In case we started SPDK with the "wait for rpc" flag, we need to call this""" @@ -783,6 +864,20 @@ def _create_transport(self, trtype): self.logger.exception(f"Create Transport {trtype} returned with error") raise + def set_gateway_exit_message(self, msg): + with self.system_exit_message_lock: + self.system_exit_message = msg + return self.gateway_exit_started.wait(GatewayServer.MAX_TIME_TO_WAIT_FOR_GATEWAY_EXIT) + + def exit_gateway_if_needed(self): + exit_msg = None + with self.system_exit_message_lock: + exit_msg = self.system_exit_message + if exit_msg is not None: + self.logger.error(f"System exit message was set to {exit_msg}") + self.gateway_exit_started.set() + raise SystemExit(exit_msg) + def keep_alive(self): """Continuously confirms communication with SPDK process.""" allowed_consecutive_spdk_ping_failures = self.config.getint_with_default( @@ -792,7 +887,7 @@ def keep_alive(self): spdk_ping_interval_in_seconds = self.config.getfloat_with_default( "gateway", "spdk_ping_interval_in_seconds", - 2.0) + GatewayServer.SPDK_PING_INTERVAL_DEFAULT) if spdk_ping_interval_in_seconds < 0.0: self.logger.warning(f"Invalid SPDK ping interval " f"{spdk_ping_interval_in_seconds}, will reset to 0") @@ -806,6 +901,7 @@ def keep_alive(self): spdk_ping_interval_in_seconds = 0.0 while True: + self.exit_gateway_if_needed() if self.gateway_rpc: if self.gateway_rpc.rebalance.rebalance_event.is_set(): self.logger.critical("Failure in rebalance, aborting") @@ -813,8 +909,7 @@ def keep_alive(self): timedout = self.server.wait_for_termination(timeout=1) if not timedout: break - if spdk_ping_interval_in_seconds > 0.0: - time.sleep(spdk_ping_interval_in_seconds) + time.sleep(spdk_ping_interval_in_seconds) alive = self._ping() if not alive: consecutive_ping_failures += 1 @@ -831,10 +926,10 @@ def keep_alive(self): def _ping(self): """Confirms communication with SPDK process.""" try: - spdk.rpc.spdk_get_version(self.spdk_rpc_ping_client) - return True + ret = self.gateway_rpc.list_subsystems(pb2.list_subsystems_req()) + return ret.status == 0 except Exception: - self.logger.exception("spdk_get_version failed") + self.logger.exception("Failure in list_subsystems()") return False 
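# --- Illustrative aside, not part of the patch: _ping() above now probes
# liveness through gateway_rpc.list_subsystems() instead of
# spdk.rpc.spdk_get_version(), so a successful ping exercises the whole
# gRPC -> subsystems-lock -> SPDK RPC path rather than only the RPC socket.
# A minimal, hedged sketch of the keep-alive pattern this implements;
# `probe` is a hypothetical stand-in for the gateway's list_subsystems()
# call, and the thresholds are made-up defaults.
import time

def keep_alive_loop(probe, interval_sec=2.0, max_consecutive_failures=4):
    """Call probe() forever, aborting after too many consecutive failures."""
    consecutive_failures = 0
    while True:
        time.sleep(interval_sec)
        try:
            ok = probe().status == 0  # mirrors `ret.status == 0` in _ping()
        except Exception:
            ok = False                # an RPC exception counts as a failed ping
        if ok:
            consecutive_failures = 0  # any success resets the failure budget
            continue
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            raise SystemExit("SPDK stopped responding to pings")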
def probe_huge_pages(self):

@@ -885,9 +980,25 @@ def probe_huge_pages(self):
 else:
 self.logger.warning(f"Can't find huge pages file {hugepages_file}")

- def gateway_rpc_caller(self, requests, is_add_req):
+ def _sleep_if_needed(self, interval, start):
+ if interval <= 0:
+ return None
+
+ if not start:
+ start = time.monotonic()
+
+ if time.monotonic() - start >= interval:
+ self.logger.debug("Will sleep and let other threads work")
+ time.sleep(0)
+ start = time.monotonic()
+
+ return start
+
+ def gateway_rpc_caller(self, requests, is_add_req, break_interval):
 """Passes RPC requests to gateway service."""
+ start_time = 0
 for key, val in requests.items():
+ start_time = self._sleep_if_needed(break_interval, start_time)
 if key.startswith(GatewayState.SUBSYSTEM_PREFIX):
 if is_add_req:
 req = json_format.Parse(val, pb2.create_subsystem_req(),
diff --git a/control/state.py b/control/state.py
index c21dab92f78..6b78ace76c8 100644
--- a/control/state.py
+++ b/control/state.py
@@ -11,6 +11,7 @@
 import threading
 import rados
 import errno
+import os
 import contextlib
 from typing import Dict
 from collections import defaultdict
@@ -306,14 +307,25 @@ def __exit__(self, exc_type, exc_value, traceback):

 class OmapLock:
 OMAP_FILE_LOCK_NAME = "omap_file_lock"
- OMAP_FILE_LOCK_COOKIE = "omap_file_cookie"
-
- def __init__(self, omap_state, gateway_state, rpc_lock: threading.Lock) -> None:
- self.logger = omap_state.logger
- self.omap_state = omap_state
+ OMAP_FILE_LOCK_COOKIE_PREFIX = "omap_file_cookie"
+ EXCLUSIVE_LOCK_NAME = "exclusive"
+ SHARED_LOCK_NAME = "shared"
+
+ changes_lock = threading.Lock()
+ no_read_lock_warning_displayed = False
+ ignore_errors_warning_displayed = False
+ is_exclusively_locked = False
+ locked_by = {}
+ lock_cookie = []
+
+ def __init__(self, gateway_state, rpc_lock: threading.Lock) -> None:
+ self.logger = gateway_state.logger
+ self.omap_state = gateway_state.omap
+ self.omap_state.omap_lock = self
 self.gateway_state = gateway_state
 self.rpc_lock = rpc_lock
- self.is_locked = False
+ self.logger.debug(f"Init OMAP lock, cookie: {self.build_omap_lock_cookie()}, thread: "
+ f"{threading.get_native_id()}, self: {hex(id(self))}")
 self.omap_file_lock_duration = self.omap_state.config.getint_with_default(
 "gateway",
 "omap_file_lock_duration",
@@ -330,6 +342,36 @@ def __init__(self, omap_state, gateway_state, rpc_lock: threading.Lock) -> None:
 "gateway",
 "omap_file_lock_retry_sleep_interval",
 1.0)
+ # This is a development flag, in case we run into issues with the read lock in the field
+ self.omap_file_lock_on_read = self.omap_state.config.getboolean_with_default(
+ "gateway",
+ "omap_file_lock_on_read",
+ True)
+ if not self.omap_file_lock_on_read and not OmapLock.no_read_lock_warning_displayed:
+ self.logger.warning("Will not lock OMAP for read, this might cause using an "
+ "inconsistent state when big OMAP files are used")
+ with OmapLock.changes_lock:
+ OmapLock.no_read_lock_warning_displayed = True
+ # This is an option for development, normally we shouldn't get errors on unlock so
+ # the flag shouldn't make a difference. There might be a case in which for some reason
+ # we take too long to handle something and the RBD unlocked the lock in the middle
+ # of the processing. The default is to abort the gateway in such a case. If we looked
+ # into the case and decided there was no problem, or we want to continue anyway we
+ # can either increase the duration of the lock (omap_file_lock_duration) or set this
+ # flag to False, which will cause the gateway to just display an error message and
+ # continue. We might also get an error on read unlock for some unknown reason,
+ # this is not a critical condition so we can just set the flag to False to let
+ # the gateway continue.
+ self.omap_file_ignore_unlock_errors = self.omap_state.config.getboolean_with_default(
+ "gateway",
+ "omap_file_ignore_unlock_errors",
+ False)
+ if self.omap_file_ignore_unlock_errors:
+ if not OmapLock.ignore_errors_warning_displayed:
+ self.logger.warning("OMAP unlock errors will be ignored, the gateway will continue")
+ with OmapLock.changes_lock:
+ OmapLock.ignore_errors_warning_displayed = True
+
 self.lock_start_time = 0.0
 # This is used for testing purposes only. To allow us testing locking
 # from two gateways at the same time
@@ -340,34 +382,36 @@ def __init__(self, omap_state, gateway_state, rpc_lock: threading.Lock) -> None:
 if self.omap_file_disable_unlock:
 self.logger.warning("Will not unlock OMAP file for testing purposes")

+ def build_omap_lock_cookie(self, exclusive_lock=True, cookie_suffix=None) -> str:
+ if cookie_suffix:
+ cookie_suffix = str(cookie_suffix) + "_"
+ else:
+ cookie_suffix = ""
+
+ cookie_prefix = f"{OmapLock.OMAP_FILE_LOCK_COOKIE_PREFIX}_" \
+ f"{hex(id(self))}_{os.getpid()}_{threading.get_native_id()}_" \
+ f"{cookie_suffix}"
+
+ if exclusive_lock:
+ omap_lock_cookie = f"{cookie_prefix}" \
+ f"{OmapLock.EXCLUSIVE_LOCK_NAME}"
+ else:
+ omap_lock_cookie = f"{cookie_prefix}" \
+ f"{OmapLock.SHARED_LOCK_NAME}"
+
+ return omap_lock_cookie
+
 #
 # We pass the context from the different functions here. It should point to a real object
 # in case we come from a real resource changing function, resulting from a CLI command. It
 # will be None in case we come from an automatic update which is done because the local
 # state is out of date. In case context is None, that is we're in the middle of an update
 # we should not try to lock the OMAP file as the code will not try to make changes there,
- # only the local spdk calls are done in such a case.
+ # only the local SPDK calls are done in such a case.
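# --- Illustrative aside, not part of the patch: the comment above describes
# the policy that get_omap_lock_to_use() implements below: a real request
# context yields OmapWriteGuard (which takes the exclusive OMAP lock and, on
# exit, checks that the operation did not outlive the lock's expiry), while
# context=None yields contextlib.suppress(), a no-op context manager. A
# hedged sketch of that pattern; WriteGuard and omap_lock_to_use are
# hypothetical stand-ins, not the patch's classes.
import contextlib
import time

class WriteGuard:
    def __init__(self, lock, max_duration_sec):
        self.lock = lock  # stands in for the Rados-backed OMAP lock
        self.max_duration_sec = max_duration_sec
        self.start = 0.0

    def __enter__(self):
        self.lock.acquire()  # stands in for OmapLock.lock_omap()
        self.start = time.monotonic()
        return self

    def __exit__(self, typ, value, tb):
        self.lock.release()  # stands in for OmapLock.unlock_omap()
        held = time.monotonic() - self.start
        # a lock held past its expiry may have been silently released under us
        assert held <= self.max_duration_sec, \
            f"lock held {held:.2f}s, past its {self.max_duration_sec}s expiry"

def omap_lock_to_use(context, lock, max_duration_sec=60):
    # Real request context -> take the write lock; None (automatic state
    # update) -> do nothing, matching `return contextlib.suppress()` below.
    if context:
        return WriteGuard(lock, max_duration_sec)
    return contextlib.suppress()

# Usage sketch: `with omap_lock_to_use(ctx, threading.Lock()): ...` runs the
# body locked only when ctx is a real object.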
# - def __enter__(self): - if self.omap_file_lock_duration > 0: - self.lock_omap() - self.lock_start_time = time.monotonic() - return self - - def __exit__(self, typ, value, traceback): - if self.omap_file_lock_duration > 0: - duration = 0.0 - if self.lock_start_time: - duration = time.monotonic() - self.lock_start_time - self.lock_start_time = 0.0 - self.unlock_omap() - if duration > self.omap_file_lock_duration: - self.logger.error(f"Operation ran for {duration:.2f} seconds, but the OMAP " - f"lock expired after {self.omap_file_lock_duration} seconds") - def get_omap_lock_to_use(self, context): if context: - return self + return OmapWriteGuard(self) return contextlib.suppress() # @@ -394,83 +438,287 @@ def execute_omap_locking_function(self, grpc_func, omap_locking_func, request, c time.sleep(1) if need_to_update: - raise Exception(f"Unable to lock OMAP file after reloading " - f"{self.omap_file_update_reloads} times, exiting") + raise RuntimeError(f"Unable to lock OMAP file after reloading " + f"{self.omap_file_update_reloads} times, exiting") - def lock_omap(self): + def lock_omap(self, verify_versions=True, lock_exclusive=True, cookie_suffix=None): got_lock = False - assert self.rpc_lock.locked(), "The RPC lock is not locked." + if lock_exclusive: + assert self.rpc_lock and self.rpc_lock.locked(), \ + "The RPC lock is not locked for exclusive OMAP lock." + + if self.omap_file_lock_duration <= 0: + raise RuntimeError("Lock duration set to 0, should not try to lock OMAP") if not self.omap_state.ioctx: self.logger.warning("Not locking OMAP as Rados connection is closed") - raise Exception("An attempt to lock OMAP file after Rados connection was closed") + raise RuntimeError("An attempt to lock OMAP file after Rados connection was closed") + + if lock_exclusive: + lock_kind = OmapLock.EXCLUSIVE_LOCK_NAME + else: + lock_kind = OmapLock.SHARED_LOCK_NAME + + lock_cookie = self.build_omap_lock_cookie(lock_exclusive, cookie_suffix) + i = 0 + while i <= self.omap_file_lock_retries: + if not self.gateway_state.update_is_active_lock.locked(): + i += 1 - for i in range(0, self.omap_file_lock_retries + 1): try: - self.omap_state.ioctx.lock_exclusive(self.omap_state.omap_name, - self.OMAP_FILE_LOCK_NAME, - self.OMAP_FILE_LOCK_COOKIE, - "OMAP file changes lock", - self.omap_file_lock_duration, 0) + if lock_exclusive: + self.omap_state.ioctx.lock_exclusive(self.omap_state.omap_name, + self.OMAP_FILE_LOCK_NAME, + lock_cookie, + "OMAP file changes lock", + self.omap_file_lock_duration, 0) + else: + self.omap_state.ioctx.lock_shared(self.omap_state.omap_name, + self.OMAP_FILE_LOCK_NAME, + lock_cookie, + "", + "OMAP file changes lock", + self.omap_file_lock_duration, 0) got_lock = True + self.logger.debug(f"Locked OMAP {lock_kind}, thread id: " + f"{threading.get_native_id()}," + f" id: {self.omap_state.id_text}, cookie: " + f"{lock_cookie}") if i > 0: - self.logger.info(f"Succeeded to lock OMAP file after {i} retries") + self.logger.info(f"Succeeded to lock OMAP file ({lock_kind}) " + f"after {i} retries") break except rados.ObjectExists: - self.logger.info("We already locked the OMAP file") - got_lock = True - break + self.logger.debug(f"OMAP was already locked by {OmapLock.locked_by} " + f", thread id: {threading.get_native_id()}, id: " + f"{self.omap_state.id_text}, cookie: " + f"{lock_cookie}") + if lock_exclusive: + raise RuntimeError("An attempt to lock OMAP exclusively twice from " + "the same thread") + else: + assert False, "Shouldn't get an ObjectExists exception for a shared lock" except 
rados.ObjectBusy:
- self.logger.warning(
- f"The OMAP file is locked, will try again in "
- f"{self.omap_file_lock_retry_sleep_interval} seconds")
- with ReleasedLock(self.rpc_lock):
- time.sleep(self.omap_file_lock_retry_sleep_interval)
+ if not lock_exclusive and self.write_locked_by_me():
+ self.logger.info("No need to lock OMAP for read as we already "
+ "have it locked for write")
+ raise FileExistsError("already hold write lock")
+ if len(OmapLock.locked_by) > 0:
+ self.logger.debug(f"OMAP is locked by {OmapLock.locked_by} with cookie: "
+ f"{OmapLock.lock_cookie}, thread id: "
+ f"{threading.get_native_id()}, id: "
+ f"{self.omap_state.id_text}, cookie: "
+ f"{lock_cookie}")
+ else:
+ self.logger.debug("OMAP is locked by an external locker")
+ except AttributeError:
+ # We got here because ioctx was closed before trying to lock
+ self.logger.exception("Got an exception trying to lock")
+ raise RuntimeError("An attempt to lock OMAP file after Rados connection was closed")
 except Exception:
- self.logger.exception("Unable to lock OMAP file, exiting")
- raise
+ self.logger.exception(f"Unable to lock OMAP file ({lock_kind}), exiting")
+ raise RuntimeError(f"Unable to lock OMAP file ({lock_kind}), exiting")
+
+ time_to_sleep = self.omap_file_lock_retry_sleep_interval
+ if not lock_exclusive:
+ time_to_sleep /= 2.0
+ self.logger.warning(
+ f"The OMAP file is locked, will try again in "
+ f"{time_to_sleep} seconds")
+ if lock_exclusive:
+ with ReleasedLock(self.rpc_lock):
+ time.sleep(time_to_sleep)
+ else:
+ time.sleep(time_to_sleep)
 if not got_lock:
- self.logger.error(f"Unable to lock OMAP file after {self.omap_file_lock_retries} "
- f"tries. Exiting!")
- raise Exception("Unable to lock OMAP file")
-
- self.is_locked = True
- omap_version = self.omap_state.get_omap_version()
- local_version = self.omap_state.get_local_version()
-
- if omap_version > local_version:
- self.logger.warning(f"Local version {local_version} differs from OMAP file "
- f"version {omap_version}. The file is not current, will "
- f"reload it and try again")
- self.unlock_omap()
- raise OSError(errno.EAGAIN,
- "Unable to lock OMAP file, file not current",
- self.omap_state.omap_name)
-
- def unlock_omap(self):
+ self.logger.error(f"Unable to lock OMAP file ({lock_kind}) after "
+ f"{self.omap_file_lock_retries} tries. 
Exiting!") + raise RuntimeError(f"Unable to lock OMAP file ({lock_kind})") + + with OmapLock.changes_lock: + if lock_exclusive: + if OmapLock.is_exclusively_locked: + assert False, \ + f"Got two exclusive locks, OMAP is locked by " \ + f"{OmapLock.locked_by} with cookie: " \ + f"{OmapLock.lock_cookie}, thread id: " \ + f"{threading.get_native_id()}, id: " \ + f"{self.omap_state.id_text}, cookie: " \ + f"{lock_cookie}" + else: + assert not len(OmapLock.locked_by) and not len(OmapLock.lock_cookie), \ + f"Got exclusive lock with shared locks, OMAP is locked by " \ + f"{OmapLock.locked_by} with cookie: " \ + f"{OmapLock.lock_cookie}, thread id: " \ + f"{threading.get_native_id()}, id: " \ + f"{self.omap_state.id_text}, cookie: " \ + f"{lock_cookie}" + + try: + OmapLock.locked_by[(threading.get_native_id(), lock_kind)] += 1 + except KeyError: + OmapLock.locked_by[(threading.get_native_id(), lock_kind)] = 1 + if lock_exclusive: + OmapLock.is_exclusively_locked = True + OmapLock.lock_cookie.append(lock_cookie) + + if verify_versions: + omap_version = self.omap_state.get_omap_version() + local_version = self.omap_state.get_local_version() + + if omap_version > local_version: + self.logger.warning(f"Local version {local_version} differs from OMAP file " + f"version {omap_version}. The file is not current, will " + f"reload it and try again") + self.unlock_omap() + raise OSError(errno.EAGAIN, + "Unable to lock OMAP file, file not current", + self.omap_state.omap_name) + + def do_unlock_omap(self, lock_cookie, lock_kind=""): + try: + self.omap_state.ioctx.unlock(self.omap_state.omap_name, + self.OMAP_FILE_LOCK_NAME, + lock_cookie) + self.logger.debug(f"OMAP was unlocked, thread id: " + f"{threading.get_native_id()}, " + f"id: {self.omap_state.id_text}, cookie: " + f"{lock_cookie}") + except rados.ObjectNotFound: + self.logger.debug(f"OMAP lock not found, thread id: " + f"{threading.get_native_id()}, " + f"id: {self.omap_state.id_text}, cookie: " + f"{lock_cookie}") + self.logger.error(f"No such lock, the {lock_kind} lock might have expired." 
+ f" Consider enlarging the OMAP lock duration field.") + if not self.omap_file_ignore_unlock_errors: + raise + except Exception: + self.logger.exception(f"Unable to {lock_kind} unlock OMAP file") + if not self.omap_file_ignore_unlock_errors: + raise + + def unlock_omap(self, unlock_exclusive=True, cookie_suffix=None): if self.omap_file_disable_unlock: self.logger.warning("OMAP file unlock was disabled, will not unlock file") return - if not self.omap_state.ioctx: - self.is_locked = False - return + if unlock_exclusive: + lock_kind = OmapLock.EXCLUSIVE_LOCK_NAME + else: + lock_kind = OmapLock.SHARED_LOCK_NAME + + lock_cookie = self.build_omap_lock_cookie(unlock_exclusive, cookie_suffix) + with OmapLock.changes_lock: + if self.omap_state.ioctx: + self.do_unlock_omap(lock_cookie, lock_kind) + else: + self.logger.warning("Trying to unlock OMAP when Rados connection is closed") + return + + try: + OmapLock.locked_by[(threading.get_native_id(), lock_kind)] -= 1 + if not OmapLock.locked_by[(threading.get_native_id(), lock_kind)]: + OmapLock.locked_by.pop((threading.get_native_id(), lock_kind), None) + except KeyError: + pass + + OmapLock.lock_cookie.remove(lock_cookie) + if unlock_exclusive: + OmapLock.is_exclusively_locked = False + + def unlock_all_omap(self): + with OmapLock.changes_lock: + cookies = OmapLock.lock_cookie.copy() + for lock_cookie in cookies: + assert self.omap_state.ioctx, "Rados connection got closed in the middle of shutdown" + try: + self.do_unlock_omap(lock_cookie) + except Exception: + pass + OmapLock.reset_lock_markers() + + def write_locked_by_me(self) -> bool: + lock_cookie = self.build_omap_lock_cookie(True) + thread_id = threading.get_native_id() + with OmapLock.changes_lock: + if not OmapLock.is_exclusively_locked: + return False + if (thread_id, OmapLock.EXCLUSIVE_LOCK_NAME) not in OmapLock.locked_by: + return False + return lock_cookie in OmapLock.lock_cookie + + def reset_lock_markers(): + with OmapLock.changes_lock: + OmapLock.is_exclusively_locked = False + OmapLock.locked_by = {} + OmapLock.lock_cookie = [] + + +class OmapReadGuard: + def __init__(self, omap_lock: OmapLock): + self.omap_lock = omap_lock + self.cookie_suffix = None + self.actually_locked = False + self.lock_start_time = 0.0 + + def __enter__(self): try: - self.omap_state.ioctx.unlock(self.omap_state.omap_name, - self.OMAP_FILE_LOCK_NAME, - self.OMAP_FILE_LOCK_COOKIE) - except rados.ObjectNotFound: - if self.is_locked: - self.logger.warning("No such lock, the lock duration might have passed") + if self.omap_lock.omap_file_lock_duration > 0: + self.cookie_suffix = time.time_ns() + self.omap_lock.lock_omap(False, False, self.cookie_suffix) + self.lock_start_time = time.monotonic() + self.actually_locked = True + return self + except FileExistsError: + # we can contiune as we already hold a write lock, but we shouldn't try to unlock + self.actually_locked = False + return self except Exception: - self.logger.exception("Unable to unlock OMAP file") pass - self.is_locked = False + return None - def locked(self): - return self.is_locked + def __exit__(self, typ, value, traceback): + if self.actually_locked: + self.omap_lock.unlock_omap(False, self.cookie_suffix) + duration = 0.0 + if self.lock_start_time: + duration = time.monotonic() - self.lock_start_time + self.lock_start_time = 0.0 + assert duration <= self.omap_lock.omap_file_lock_duration, \ + f"Operation ran for {duration:.2f} seconds, but the " \ + f"OMAP {OmapLock.SHARED_LOCK_NAME} lock expired after " \ + 
f"{self.omap_lock.omap_file_lock_duration} seconds. Consider " \ + f"enlarging the OMAP lock duration field." + self.cookie_suffix = None + self.actually_locked = False + + +class OmapWriteGuard: + def __init__(self, omap_lock: OmapLock): + self.omap_lock = omap_lock + self.lock_start_time = 0.0 + + def __enter__(self): + if self.omap_lock.omap_file_lock_duration > 0: + self.omap_lock.lock_omap() + self.lock_start_time = time.monotonic() + return self + + def __exit__(self, typ, value, traceback): + if self.omap_lock.omap_file_lock_duration > 0: + self.omap_lock.unlock_omap() + duration = 0.0 + if self.lock_start_time: + duration = time.monotonic() - self.lock_start_time + self.lock_start_time = 0.0 + assert duration <= self.omap_lock.omap_file_lock_duration, \ + f"Operation ran for {duration:.2f} seconds, but the " \ + f"OMAP {OmapLock.EXCLUSIVE_LOCK_NAME} lock expired after " \ + f"{self.omap_lock.omap_file_lock_duration} seconds. Consider " \ + f"enlarging the OMAP lock duration field." class OmapGatewayState(GatewayState): @@ -493,19 +741,26 @@ class OmapGatewayState(GatewayState): OMAP_VERSION_KEY = "omap_version" - def __init__(self, config, id_text=""): + def __init__(self, config, set_gateway_exit_message, id_text=""): self.config = config self.version = 1 self.logger = GatewayLogger(self.config).logger self.ioctx = None self.watch = None + self.omap_lock = None gateway_group = self.config.get("gateway", "group") self.omap_name = f"nvmeof.{gateway_group}.state" if gateway_group else "nvmeof.state" self.notify_timeout = self.config.getint_with_default("gateway", "state_update_timeout_in_msec", 2000) + # This is a development flag, in case we run into issues with gateway crashes in the field + self.abort_on_error = self.config.getboolean_with_default("gateway", + "abort_on_errors", + True) self.conn = None self.id_text = id_text + self.up_and_running = True + self.set_gateway_exit_message = set_gateway_exit_message try: self.ioctx = self.open_rados_connection(self.config) @@ -527,13 +782,6 @@ def __init__(self, config, id_text=""): def __exit__(self, exc_type, exc_value, traceback): self.cleanup_omap() - def check_for_old_format_omap_files(self): - omap_dict = self.get_state() - for omap_item_key in omap_dict.keys(): - if omap_item_key.startswith("bdev"): - raise Exception("Old OMAP file format, still contains bdevs, please " - "remove file and try again") - def open_rados_connection(self, config): ceph_pool = config.get("ceph", "pool") ceph_conf = config.get("ceph", "config_file") @@ -572,24 +820,103 @@ def get_omap_version(self) -> int: f" invalid number of values ({value_list}).") raise - def get_state(self) -> Dict[str, str]: + def read_omap_values(self): """Returns dict of all OMAP keys and values.""" omap_list = [("", 0)] # Dummy, non empty, list value. 
Just so we would enter the while omap_dict = {} - if not self.ioctx: - self.logger.warning("Trying to get OMAP state when Rados connection is closed") - return omap_dict + # The number of items returned is limited by Ceph, so we need to read in # a loop until no more items are returned + get_omap_vals_count = 0 while len(omap_list) > 0: last_key_read = omap_list[-1][0] with rados.ReadOpCtx() as read_op: + if not self.ioctx: + raise RuntimeError("Trying to get OMAP state when Rados " + "connection is closed") i, _ = self.ioctx.get_omap_vals(read_op, last_key_read, "", -1) self.ioctx.operate_read_op(read_op, self.omap_name) omap_list = list(i) omap_dict.update(dict(omap_list)) + get_omap_vals_count += 1 + + return omap_dict, get_omap_vals_count + + def get_state_internal(self) -> Dict[str, str]: + """Returns dict of all OMAP keys and values.""" + + got_omap_lock = False + get_omap_vals_count = 0 + omap_dict = None + if self.omap_lock and self.omap_lock.omap_file_lock_on_read: + try: + actually_locked = False + with OmapReadGuard(self.omap_lock) as guard: + if guard is None: + self.logger.error(f"Failed to lock OMAP file for read, will try " + f"to read atomically without a lock ({self.id_text})") + else: + got_omap_lock = True + if guard.actually_locked: + actually_locked = True + self.logger.debug(f"Locked OMAP file before reading its " + f"content ({self.id_text})") + else: + self.logger.debug(f"OMAP file is already locked, read its " + f"content ({self.id_text})") + omap_dict, get_omap_vals_count = self.read_omap_values() + if actually_locked: + self.logger.debug(f"Released OMAP file lock after reading " + f"content ({self.id_text})") + if omap_dict is not None: + assert got_omap_lock + return omap_dict + except Exception: + self.logger.exception(f"Failed to lock OMAP file ({self.id_text})") + # We passed the lock, so we got an exception trying to read + if got_omap_lock: + raise + + assert omap_dict is None + assert not got_omap_lock + omap_dict, get_omap_vals_count = self.read_omap_values() + + if get_omap_vals_count > 2: + # We couldn't lock and read OMAP in several calls, which is not atomic + raise RuntimeError("We failed locking the OMAP file and we can't read it atomically") + return omap_dict + def get_state(self, allow_abort_on_error=True) -> Dict[str, str]: + """Returns dict of all OMAP keys and values.""" + + if not self.up_and_running: + return None + + try: + return self.get_state_internal() + except Exception: + self.logger.exception(f"Failure while getting state ({self.id_text})") + if self.abort_on_error and allow_abort_on_error: + msg = f"Will abort because of an error getting state ({self.id_text})" + self.logger.critical(msg) + self.up_and_running = False + if self.set_gateway_exit_message is not None: + if not self.set_gateway_exit_message(msg): + self.logger.warning(f"Can't get an indication about the gateway aborting." 
+ f" Will continue after an error " + f"getting state ({self.id_text})") + raise + else: + self.logger.warning(f"No gateway exit function set, will continue after " + f"an error getting state ({self.id_text})") + raise + else: + self.logger.warning(f"Abort on errors is disabled, will continue after " + f"an error getting state ({self.id_text})") + raise + return None + def _add_key(self, key: str, val: str): """Adds key and value to the OMAP.""" if not self.ioctx: @@ -689,7 +1016,10 @@ def cleanup_omap(self, omap_lock=None): pass if omap_lock and omap_lock.omap_file_lock_duration > 0: try: - omap_lock.unlock_omap() + # We already shutting down, no point in raising exceptions now + omap_lock.omap_file_ignore_unlock_errors = True + omap_lock.unlock_all_omap() + self.logger.debug(f"Unlocked all OMAP locks ({self.id_text})") except Exception: pass if self.ioctx: @@ -733,8 +1063,12 @@ def __init__(self, config, local, omap, gateway_rpc_caller, crypto, id_text=""): self.update_interval = 1 self.use_notify = self.config.getboolean("gateway", "state_update_notify") + self.break_update_interval = self.config.getint_with_default("gateway", + "break_update_interval_sec", + 25) self.update_is_active_lock = threading.Lock() self.id_text = id_text + self.up_and_running = True def add_namespace(self, subsystem_nqn: str, nsid: str, val: str): """Adds a namespace to the state data store.""" @@ -824,6 +1158,9 @@ def start_update(self): def _update_caller(self, notify_event): """Periodically calls for update.""" while True: + if not self.up_and_running: + self.logger.warning("Server is going down, stop updates") + break update_time = time.time() + self.update_interval self.update() notify_event.wait(max(update_time - time.time(), 0)) @@ -1089,24 +1426,35 @@ def update(self) -> bool: self.logger.warning("An update is already running, ignore") return False - if not self.omap.ioctx: - self.logger.warning("Can't update when Rados connection is closed") - return False - with self.update_is_active_lock: prefix_list = [ GatewayState.SUBSYSTEM_PREFIX, GatewayState.HOST_PREFIX, - GatewayState.LISTENER_PREFIX, GatewayState.NAMESPACE_PREFIX, GatewayState.NAMESPACE_QOS_PREFIX, GatewayState.NAMESPACE_HOST_PREFIX, + GatewayState.LISTENER_PREFIX, ] + if not self.omap.ioctx: + self.logger.warning("Can't update when Rados connection is closed") + return False + # Get version and state from OMAP - omap_state_dict = self.omap.get_state() - omap_version = int(omap_state_dict[self.omap.OMAP_VERSION_KEY]) - local_version = self.omap.get_local_version() + try: + omap_state_dict = self.omap.get_state(False) + if omap_state_dict is None: + return False + omap_version = int(omap_state_dict[self.omap.OMAP_VERSION_KEY]) + local_version = self.omap.get_local_version() + except RuntimeError: + self.logger.exception("Failure getting OMAP state") + return False + except Exception: + if not self.omap.ioctx: + self.logger.warning("Can't update when Rados connection is closed") + return False + raise self.logger.debug(f"Check local version {local_version} against OMAP version " f"{omap_version} ({self.id_text}).") @@ -1119,14 +1467,11 @@ def update(self) -> bool: # Find OMAP additions added_keys = omap_state_keys - local_state_keys - self.logger.info(f"Added keys: {added_keys}") + self.logger.debug(f"Added keys: {added_keys}") added = {key: omap_state_dict[key] for key in added_keys} grouped_added = self._group_by_prefix(added, prefix_list) # Find OMAP changes same_keys = omap_state_keys & local_state_keys - for key in same_keys: - 
self.logger.debug(f"same key: {key}, local: {local_state_dict[key]}," - f"omap: {omap_state_dict[key]}") changed = { key: omap_state_dict[key] for key in same_keys @@ -1142,16 +1487,14 @@ def update(self) -> bool: only_host_key_changed = [] only_subsystem_key_changed = [] for key in changed.keys(): - self.logger.info(f"Changed key: {key} local-state: {local_state_dict[key]}" - f" omap-state: {omap_state_dict[key]}") if key.startswith(GatewayState.NAMESPACE_PREFIX): (should_process, new_lb_grp_id) = self.namespace_only_lb_group_id_changed( local_state_dict[key], omap_state_dict[key]) if should_process: assert new_lb_grp_id, "Shouldn't get here with an empty lb group id" - self.logger.info(f"Found {key} where only the load balancing group id " - f"has changed. The new group id is {new_lb_grp_id}") + self.logger.debug(f"Found {key} where only the load balancing group id " + f"has changed. The new group id is {new_lb_grp_id}") only_lb_group_changed.append((key, new_lb_grp_id)) (should_process, @@ -1380,9 +1723,9 @@ def _update_call_rpc(self, grouped_state_update, is_add_req, prefix_list): for prefix in prefix_list: component_update = grouped_state_update.get(prefix, {}) if component_update: - self.gateway_rpc_caller(component_update, True) + self.gateway_rpc_caller(component_update, True, self.break_update_interval) else: for prefix in list(reversed(prefix_list)): component_update = grouped_state_update.get(prefix, {}) if component_update: - self.gateway_rpc_caller(component_update, False) + self.gateway_rpc_caller(component_update, False, self.break_update_interval) diff --git a/control/utils.py b/control/utils.py index 9e5ba2b364e..cf758ced898 100644 --- a/control/utils.py +++ b/control/utils.py @@ -209,6 +209,8 @@ def is_valid_nqn(nqn): class GatewayUtilsCrypto: KEY_SIZE = 32 INVALID_KEY_VALUE = "" + KEY_START = "-----BEGIN PRIVATE KEY-----" + KEY_END = "-----END PRIVATE KEY-----" def __init__(self, encryption_key: bytes): if encryption_key: @@ -220,17 +222,24 @@ def __init__(self, encryption_key: bytes): def read_encryption_key(cls, keyfile: str) -> bytes: keyval = "" encoded_key = None + # a valid key has several lines but cephadm has an issue when exporting + # key values and will change newlines to spaces, so handle both cases try: with open(keyfile) as f: for line in f: - if line.startswith("-----BEGIN PRIVATE KEY-----"): - continue - if line.startswith("-----END PRIVATE KEY-----"): - continue - keyval += line.rstrip('\n') + keyval += line.strip() except FileNotFoundError: return None + if not keyval: + raise RuntimeError("Invalid encryption key, key is empty") + + if not keyval.startswith(cls.KEY_START): + raise RuntimeError("Invalid encryption key, doesn't start with start marker") + if not keyval.endswith(cls.KEY_END): + raise RuntimeError("Invalid encryption key, doesn't end with end marker") + keyval = keyval.removeprefix(cls.KEY_START).removesuffix(cls.KEY_END).replace(" ", "") + keybytes = base64.b64decode(keyval, validate=True) if len(keybytes) < cls.KEY_SIZE: raise RuntimeError(f"Encryption key has length {len(keybytes)} which is too short. 
" @@ -512,7 +521,8 @@ def compress_final_log_file(self, gw_name): class NICS: - def __init__(self, handle_all=False): + def __init__(self, logger=None, handle_all=False): + self.logger = logger self.ignored_device_prefixes = ('lo') self.addresses = {} self.adapters = {} @@ -521,10 +531,20 @@ def __init__(self, handle_all=False): self._build_adapter_info() def _build_adapter_info(self): - for device_name in netifaces.interfaces(): + interfaces = netifaces.interfaces() + if self.logger: + self.logger.debug(f"Network interfaces: {interfaces}") + for device_name in interfaces: if device_name.startswith(self.ignored_device_prefixes): continue - nic = NIC(device_name) + try: + nic = NIC(device_name) + except Exception: + if self.logger: + self.logger.exception(f"Error in interface {device_name}") + continue + if self.logger: + self.logger.debug(f"interface {device_name}: {nic}") for ipv4_addr in nic.ipv4_addresses: self.addresses[ipv4_addr] = device_name for ipv6_addr in nic.ipv6_addresses: diff --git a/mk/containerized.mk b/mk/containerized.mk index 97a5388359a..54c62d298bd 100644 --- a/mk/containerized.mk +++ b/mk/containerized.mk @@ -20,7 +20,7 @@ $(DOCKER_COMPOSE_COMMANDS): pull: ## Download SVC images build: ## Build SVC images -build: DOCKER_COMPOSE_ENV = DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 +build: DOCKER_COMPOSE_ENV = DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 COMPOSE_PROFILES=build push: QUAY := $(CONTAINER_REGISTRY) push: IMAGES := nvmeof nvmeof-cli diff --git a/pdm.lock b/pdm.lock index 6c2c9df55ac..a9a1aedca56 100644 --- a/pdm.lock +++ b/pdm.lock @@ -96,53 +96,66 @@ files = [ [[package]] name = "cryptography" -version = "44.0.0" +version = "45.0.2" requires_python = "!=3.9.0,!=3.9.1,>=3.7" summary = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
groups = ["default"] dependencies = [ - "cffi>=1.12; platform_python_implementation != \"PyPy\"", + "cffi>=1.14; platform_python_implementation != \"PyPy\"", ] files = [ - {file = "cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831c3c4d0774e488fdc83a1923b49b9957d33287de923d58ebd3cec47a0ae43f"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543"}, - {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e"}, - {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e"}, - {file = "cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053"}, - {file = "cryptography-44.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:abc998e0c0eee3c8a1904221d3f67dcfa76422b23620173e28c11d3e626c21bd"}, - {file = "cryptography-44.0.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:660cb7312a08bc38be15b696462fa7cc7cd85c3ed9c576e81f4dc4d8b2b31591"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1923cb251c04be85eec9fda837661c67c1049063305d6be5721643c22dd4e2b7"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:404fdc66ee5f83a1388be54300ae978b2efd538018de18556dde92575e05defc"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c"}, - {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64"}, - {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285"}, - {file = "cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417"}, - {file = "cryptography-44.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:708ee5f1bafe76d041b53a4f95eb28cdeb8d18da17e597d46d7833ee59b97ede"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:37d76e6863da3774cd9db5b409a9ecfd2c71c981c38788d3fcfaf177f447b731"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = 
"sha256:f677e1268c4e23420c3acade68fac427fffcb8d19d7df95ed7ad17cdef8404f4"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5e7cb1e5e56ca0933b4873c0220a78b773b24d40d186b6738080b73d3d0a756"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:8b3e6eae66cf54701ee7d9c83c30ac0a1e3fa17be486033000f2a73a12ab507c"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:be4ce505894d15d5c5037167ffb7f0ae90b7be6f2a98f9a5c3442395501c32fa"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:62901fb618f74d7d81bf408c8719e9ec14d863086efe4185afd07c352aee1d2c"}, - {file = "cryptography-44.0.0.tar.gz", hash = "sha256:cd4e834f340b4293430701e772ec543b0fbe6c2dea510a5286fe0acabe153a02"}, + {file = "cryptography-45.0.2-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:61a8b1bbddd9332917485b2453d1de49f142e6334ce1d97b7916d5a85d179c84"}, + {file = "cryptography-45.0.2-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4cc31c66411e14dd70e2f384a9204a859dc25b05e1f303df0f5326691061b839"}, + {file = "cryptography-45.0.2-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:463096533acd5097f8751115bc600b0b64620c4aafcac10c6d0041e6e68f88fe"}, + {file = "cryptography-45.0.2-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:cdafb86eb673c3211accffbffdb3cdffa3aaafacd14819e0898d23696d18e4d3"}, + {file = "cryptography-45.0.2-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:05c2385b1f5c89a17df19900cfb1345115a77168f5ed44bdf6fd3de1ce5cc65b"}, + {file = "cryptography-45.0.2-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:e9e4bdcd70216b08801e267c0b563316b787f957a46e215249921f99288456f9"}, + {file = "cryptography-45.0.2-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:b2de529027579e43b6dc1f805f467b102fb7d13c1e54c334f1403ee2b37d0059"}, + {file = "cryptography-45.0.2-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:10d68763892a7b19c22508ab57799c4423c7c8cd61d7eee4c5a6a55a46511949"}, + {file = "cryptography-45.0.2-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2a90ce2f0f5b695e4785ac07c19a58244092f3c85d57db6d8eb1a2b26d2aad6"}, + {file = "cryptography-45.0.2-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:59c0c8f043dd376bbd9d4f636223836aed50431af4c5a467ed9bf61520294627"}, + {file = "cryptography-45.0.2-cp311-abi3-win32.whl", hash = "sha256:80303ee6a02ef38c4253160446cbeb5c400c07e01d4ddbd4ff722a89b736d95a"}, + {file = "cryptography-45.0.2-cp311-abi3-win_amd64.whl", hash = "sha256:7429936146063bd1b2cfc54f0e04016b90ee9b1c908a7bed0800049cbace70eb"}, + {file = "cryptography-45.0.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:e86c8d54cd19a13e9081898b3c24351683fd39d726ecf8e774aaa9d8d96f5f3a"}, + {file = "cryptography-45.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e328357b6bbf79928363dbf13f4635b7aac0306afb7e5ad24d21d0c5761c3253"}, + {file = "cryptography-45.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49af56491473231159c98c2c26f1a8f3799a60e5cf0e872d00745b858ddac9d2"}, + {file = "cryptography-45.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:f169469d04a23282de9d0be349499cb6683b6ff1b68901210faacac9b0c24b7d"}, + {file = "cryptography-45.0.2-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9cfd1399064b13043082c660ddd97a0358e41c8b0dc7b77c1243e013d305c344"}, + {file = 
"cryptography-45.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:18f8084b7ca3ce1b8d38bdfe33c48116edf9a08b4d056ef4a96dceaa36d8d965"}, + {file = "cryptography-45.0.2-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2cb03a944a1a412724d15a7c051d50e63a868031f26b6a312f2016965b661942"}, + {file = "cryptography-45.0.2-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a9727a21957d3327cf6b7eb5ffc9e4b663909a25fea158e3fcbc49d4cdd7881b"}, + {file = "cryptography-45.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ddb8d01aa900b741d6b7cc585a97aff787175f160ab975e21f880e89d810781a"}, + {file = "cryptography-45.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:c0c000c1a09f069632d8a9eb3b610ac029fcc682f1d69b758e625d6ee713f4ed"}, + {file = "cryptography-45.0.2-cp37-abi3-win32.whl", hash = "sha256:08281de408e7eb71ba3cd5098709a356bfdf65eebd7ee7633c3610f0aa80d79b"}, + {file = "cryptography-45.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:48caa55c528617fa6db1a9c3bf2e37ccb31b73e098ac2b71408d1f2db551dde4"}, + {file = "cryptography-45.0.2-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a8ec324711596fbf21837d3a5db543937dd84597d364769b46e0102250023f77"}, + {file = "cryptography-45.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:965611880c3fa8e504b7458484c0697e00ae6e937279cd6734fdaa2bc954dc49"}, + {file = "cryptography-45.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d891942592789fa0ab71b502550bbadb12f540d7413d7d7c4cef4b02af0f5bc6"}, + {file = "cryptography-45.0.2-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:b19f4b28dd2ef2e6d600307fee656c00825a2980c4356a7080bd758d633c3a6f"}, + {file = "cryptography-45.0.2-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:7c73968fbb7698a4c5d6160859db560d3aac160edde89c751edd5a8bc6560c88"}, + {file = "cryptography-45.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:501de1296b2041dccf2115e3c7d4947430585601b251b140970ce255c5cfb985"}, + {file = "cryptography-45.0.2-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1655d3a76e3dedb683c982a6c3a2cbfae2d08f47a48ec5a3d58db52b3d29ea6f"}, + {file = "cryptography-45.0.2-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc7693573f16535428183de8fd27f0ca1ca37a51baa0b41dc5ed7b3d68fe80e2"}, + {file = "cryptography-45.0.2-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:614bca7c6ed0d8ad1dce683a6289afae1f880675b4090878a0136c3da16bc693"}, + {file = "cryptography-45.0.2-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:4142e20c29224cec63e9e32eb1e6014fb285fe39b7be66b3564ca978a3a8afe9"}, + {file = "cryptography-45.0.2-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:9a900036b42f7324df7c7ad9569eb92ba0b613cf699160dd9c2154b24fd02f8e"}, + {file = "cryptography-45.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:057723b79752a142efbc609e90b0dff27b0361ccbee3bd48312d70f5cdf53b78"}, + {file = "cryptography-45.0.2.tar.gz", hash = "sha256:d784d57b958ffd07e9e226d17272f9af0c41572557604ca7554214def32c26bf"}, ] [[package]] name = "exceptiongroup" -version = "1.2.2" +version = "1.3.0" requires_python = ">=3.7" summary = "Backport of PEP 654 (exception groups)" groups = ["test"] marker = "python_version < \"3.11\"" +dependencies = [ + "typing-extensions>=4.6.0; python_version < \"3.13\"", +] files = [ - {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, - {file = "exceptiongroup-1.2.2.tar.gz", hash = 
"sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, + {file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"}, + {file = "exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88"}, ] [[package]] @@ -226,13 +239,13 @@ files = [ [[package]] name = "iniconfig" -version = "2.0.0" -requires_python = ">=3.7" +version = "2.1.0" +requires_python = ">=3.8" summary = "brain-dead simple config-ini parsing" groups = ["test"] files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, + {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, + {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] [[package]] @@ -250,24 +263,24 @@ files = [ [[package]] name = "packaging" -version = "24.2" +version = "25.0" requires_python = ">=3.8" summary = "Core utilities for Python packages" groups = ["test"] files = [ - {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, - {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, + {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, + {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, ] [[package]] name = "pluggy" -version = "1.5.0" -requires_python = ">=3.8" +version = "1.6.0" +requires_python = ">=3.9" summary = "plugin and hook calling mechanisms for python" groups = ["test"] files = [ - {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, - {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, + {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, + {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] [[package]] @@ -283,20 +296,20 @@ files = [ [[package]] name = "protobuf" -version = "4.25.5" +version = "4.25.7" requires_python = ">=3.8" summary = "" groups = ["default"] files = [ - {file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = "sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"}, - {file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"}, - {file = "protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173"}, - {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d"}, - {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331"}, - {file = "protobuf-4.25.5-cp39-cp39-win32.whl", hash = "sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f"}, - {file = 
"protobuf-4.25.5-cp39-cp39-win_amd64.whl", hash = "sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45"}, - {file = "protobuf-4.25.5-py3-none-any.whl", hash = "sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41"}, - {file = "protobuf-4.25.5.tar.gz", hash = "sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584"}, + {file = "protobuf-4.25.7-cp310-abi3-win32.whl", hash = "sha256:dc582cf1a73a6b40aa8e7704389b8d8352da616bc8ed5c6cc614bdd0b5ce3f7a"}, + {file = "protobuf-4.25.7-cp310-abi3-win_amd64.whl", hash = "sha256:cd873dbddb28460d1706ff4da2e7fac175f62f2a0bebc7b33141f7523c5a2399"}, + {file = "protobuf-4.25.7-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:4c899f09b0502eb39174c717ccf005b844ea93e31137c167ddcacf3e09e49610"}, + {file = "protobuf-4.25.7-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:6d2f5dede3d112e573f0e5f9778c0c19d9f9e209727abecae1d39db789f522c6"}, + {file = "protobuf-4.25.7-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:d41fb7ae72a25fcb79b2d71e4247f0547a02e8185ed51587c22827a87e5736ed"}, + {file = "protobuf-4.25.7-cp39-cp39-win32.whl", hash = "sha256:2f738d4f341186e697c4cdd0e03143ee5cf6cf523790748e61273a51997494c3"}, + {file = "protobuf-4.25.7-cp39-cp39-win_amd64.whl", hash = "sha256:3629b34b65f6204b17adf4ffe21adc8e85f6c6c0bc2baf3fb001b0d343edaebb"}, + {file = "protobuf-4.25.7-py3-none-any.whl", hash = "sha256:e9d969f5154eaeab41404def5dcf04e62162178f4b9de98b2d3c1c70f5f84810"}, + {file = "protobuf-4.25.7.tar.gz", hash = "sha256:28f65ae8c14523cc2c76c1e91680958700d3eac69f45c96512c12c63d9a38807"}, ] [[package]] @@ -313,7 +326,7 @@ files = [ [[package]] name = "pytest" -version = "8.3.4" +version = "8.3.5" requires_python = ">=3.8" summary = "pytest: simple powerful testing with Python" groups = ["test"] @@ -326,8 +339,8 @@ dependencies = [ "tomli>=1; python_version < \"3.11\"", ] files = [ - {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, - {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, + {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, + {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, ] [[package]] @@ -387,13 +400,13 @@ files = [ [[package]] name = "setuptools" -version = "75.8.0" +version = "80.8.0" requires_python = ">=3.9" summary = "Easily download, build, install, upgrade, and uninstall Python packages" groups = ["default"] files = [ - {file = "setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3"}, - {file = "setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6"}, + {file = "setuptools-80.8.0-py3-none-any.whl", hash = "sha256:95a60484590d24103af13b686121328cc2736bee85de8936383111e421b9edc0"}, + {file = "setuptools-80.8.0.tar.gz", hash = "sha256:49f7af965996f26d43c8ae34539c8d99c5042fbff34302ea151eaa9c207cd257"}, ] [[package]] @@ -448,3 +461,15 @@ files = [ {file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"}, {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] + +[[package]] +name = "typing-extensions" +version = "4.13.2" +requires_python = ">=3.8" +summary = "Backported and 
Experimental Type Hints for Python 3.8+" +groups = ["test"] +marker = "python_version < \"3.11\"" +files = [ + {file = "typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c"}, + {file = "typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef"}, +] diff --git a/pyproject.toml b/pyproject.toml index d17384322b1..e41ec769498 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "pdm.backend" [project] name = "ceph-nvmeof" -version = "1.4.4" +version = "1.4.23" description = "Service to provide Ceph storage over NVMe-oF protocol" readme = "README.md" requires-python = ">3.9.1" diff --git a/spdk b/spdk index 7b0a49b0dca..99ea576d239 160000 --- a/spdk +++ b/spdk @@ -1 +1 @@ -Subproject commit 7b0a49b0dcae22a1b116254aaa36f10c63dba435 +Subproject commit 99ea576d2399646848c9343eb9d9f1d9ea420aec diff --git a/tests/atom/clusterBuildTestsRun.sh b/tests/atom/clusterBuildTestsRun.sh index e8a2f6582ea..7f827d0d001 100755 --- a/tests/atom/clusterBuildTestsRun.sh +++ b/tests/atom/clusterBuildTestsRun.sh @@ -17,6 +17,7 @@ ATOM_SHA=$4 ACTION_URL=$5 NIGHTLY=$6 +echo "CEPH_SHA found is: $CEPH_SHA" RUNNER_FOLDER='/home/cephnvme/actions-runner-ceph-m7' BUSY_FILE='/home/cephnvme/busyServer.txt' RUNNER_NIGHTLY_FOLDER='/home/cephnvme/actions-runner-ceph-m8' @@ -76,12 +77,12 @@ if [ "$NIGHTLY" != "nightly" ]; then --cli-img=quay.io/ceph/nvmeof-cli:"$VERSION" \ --initiators=1 \ --gw-group-num=1 \ - --gw-num=4 \ + --gw-num=2 \ --gw-to-stop-num=1 \ --gw-scale-down-num=1 \ --subsystem-num=2 \ --ns-num=4 \ - --subsystem-max-ns-num=1024 \ + --subsystem-max-ns-num=2048 \ --failover-num=2 \ --failover-num-after-upgrade=2 \ --rbd-size=200M \ @@ -102,6 +103,8 @@ if [ "$NIGHTLY" != "nightly" ]; then --journalctl-to-console \ --dont-power-off-cloud-vms \ --skip-lb-group-change-test \ + --skip-gw-failover-latency-test \ + --skip-reservations-basic-test \ --ibm-cloud-key=nokey \ --github-nvmeof-token=nokey \ --env=m7 @@ -119,12 +122,12 @@ else --cli-img=quay.io/ceph/nvmeof-cli:"$VERSION" \ --initiators=1 \ --gw-group-num=1 \ - --gw-num=4 \ + --gw-num=8 \ --gw-to-stop-num=1 \ --gw-scale-down-num=1 \ --subsystem-num=4 \ --ns-num=230 \ - --subsystem-max-ns-num=1024 \ + --subsystem-max-ns-num=2048 \ --failover-num=10 \ --failover-num-after-upgrade=2 \ --rbd-size=200M \ @@ -145,6 +148,7 @@ else --skip-lb-group-change-test \ --skip-block-list-test \ --skip-multi-hosts-conn-test \ + --skip-reservations-basic-test \ --ibm-cloud-key=nokey \ --github-nvmeof-token=nokey \ --env=m8 diff --git a/tests/ceph-nvmeof.cluster_pool.conf b/tests/ceph-nvmeof.cluster_pool.conf index 3b2ef5798fb..1b377f523bd 100644 --- a/tests/ceph-nvmeof.cluster_pool.conf +++ b/tests/ceph-nvmeof.cluster_pool.conf @@ -39,7 +39,7 @@ max_ns_to_change_lb_grp = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_hosts = 2048 -#max_namespaces = 1024 +#max_namespaces = 2048 #max_namespaces_per_subsystem = 256 #max_hosts_per_subsystem = 32 diff --git a/tests/ceph-nvmeof.flat_bdevs_per_cluster.conf b/tests/ceph-nvmeof.flat_bdevs_per_cluster.conf index 43b9b4a3172..89425896946 100644 --- a/tests/ceph-nvmeof.flat_bdevs_per_cluster.conf +++ b/tests/ceph-nvmeof.flat_bdevs_per_cluster.conf @@ -39,7 +39,7 @@ max_ns_to_change_lb_grp = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_hosts = 2048 -#max_namespaces = 1024 +#max_namespaces = 2048 #max_namespaces_per_subsystem = 256 
#max_hosts_per_subsystem = 32 diff --git a/tests/ceph-nvmeof.no-huge.conf b/tests/ceph-nvmeof.no-huge.conf index 4996d85fef2..5d3f834320a 100644 --- a/tests/ceph-nvmeof.no-huge.conf +++ b/tests/ceph-nvmeof.no-huge.conf @@ -39,7 +39,7 @@ max_ns_to_change_lb_grp = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_hosts = 2048 -#max_namespaces = 1024 +#max_namespaces = 2048 #max_namespaces_per_subsystem = 256 #max_hosts_per_subsystem = 128 diff --git a/tests/ceph-nvmeof.tls.conf b/tests/ceph-nvmeof.tls.conf index 840ad4970c8..2a5cfbac333 100644 --- a/tests/ceph-nvmeof.tls.conf +++ b/tests/ceph-nvmeof.tls.conf @@ -38,7 +38,7 @@ max_ns_to_change_lb_grp = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_hosts = 2048 -#max_namespaces = 1024 +#max_namespaces = 2048 #max_namespaces_per_subsystem = 256 #max_hosts_per_subsystem = 128 diff --git a/tests/ha/4gws_create_delete.sh b/tests/ha/4gws_create_delete.sh index 5412e437b2c..54073c2bf3a 100755 --- a/tests/ha/4gws_create_delete.sh +++ b/tests/ha/4gws_create_delete.sh @@ -126,7 +126,7 @@ verify_num_namespaces_gw_idx() { GW_NAME=$(gw_name $g) GW_IP=$(gw_ip $g) - for i in $(seq 20); do + for i in $(seq 50); do # timeout after 50*5 seconds echo "verify_num_namespaces $i $GW_NAME $GW_IP" subs=$(docker compose run --rm nvmeof-cli --server-address $GW_IP --server-port 5500 get_subsystems 2>&1 | sed 's/Get subsystems://') diff --git a/tests/ha/ceph_status.sh b/tests/ha/ceph_status.sh index ebb3858ca1e..d445cfda39d 100755 --- a/tests/ha/ceph_status.sh +++ b/tests/ha/ceph_status.sh @@ -9,7 +9,7 @@ docker compose exec -T ceph ceph status echo "ℹ️ Step 1: verify 2 gateways" -docker compose exec -T ceph ceph status | grep "2 gateways: 2 active" +docker compose exec -T ceph ceph status | grep "2 gateways active" echo "ℹ️ Step 2: stop a gateway" @@ -19,4 +19,4 @@ sleep 5 echo "ℹ️ Step 3: verify 1 gateway" -docker compose exec -T ceph ceph status | grep "2 gateways: 1 active" +docker compose exec -T ceph ceph status | grep "1 gateway active" diff --git a/tests/ha/demo_test.sh b/tests/ha/demo_test.sh index 3f9d9b24235..1c2084ecf78 100755 --- a/tests/ha/demo_test.sh +++ b/tests/ha/demo_test.sh @@ -480,11 +480,51 @@ function demo_bdevperf_psk() fi set -e + echo "ℹ️ use encryption key like it was exported by cephadm" + docker exec ${NVMEOF_CONTAINER_NAME} rm -f /var/log/ceph/ex_encryption.key /tmp/create_enckey.sh + rm -f /tmp/create_enckey.sh + echo "#!/bin/bash" > /tmp/create_enckey.sh + echo 'echo -n "-----BEGIN PRIVATE KEY----- MIIBVAIBADANBgkqhkiG9w0BAQEFAASCAT4wggE6AgEAAkEAqg+wrkvj9D47BRVi A4tMOv4aBL6RLBbLEwYuhJSLTG6FagZFNknjRj0y9s5C+J0fktl3XMu9UmyUR1LR 3ojPlwIDAQABAkA2F9ONPVp+4CSJ02lf0zkmMpk4FR28NmvV20uEpHNClggqmjmW zFjGV+KHJ//r17gQD3yh+NvJzX9FlncseluBAiEA3MjrizLw6wjsk80IaGL8oQNd cUlD2wYTW6Gk7JLlFmECIQDFL6Chljk3rBoPl0jASBFHq1FT/Zqgg/z060OWBns4 9wIhAKkd3g7J/nCKbWzpaL9M02YiRbk4/ZkPllRiBQqRmpkBAiAgCx9VYu4lZ+hM RE9kP9HfDa4HshygnRJMUrcG+EKp/QIgR5uDteq1fToI5ZbYOf+KJsVoJOpPrN3b vPKX3JuIds8= -----END PRIVATE KEY-----" > /var/log/ceph/ex_encryption.key' >> /tmp/create_enckey.sh + chmod 755 /tmp/create_enckey.sh + docker cp /tmp/create_enckey.sh ${NVMEOF_CONTAINER_NAME}:/tmp/ + docker exec ${NVMEOF_CONTAINER_NAME} /tmp/create_enckey.sh + rm -f /tmp/create_enckey.sh + sed -i 's#encryption_key = /etc/ceph/encryption.key#encryption_key = /var/log/ceph/ex_encryption.key#' ceph-nvmeof.conf + docker restart ${NVMEOF_CONTAINER_NAME} + sleep 20 + cephnvmf_func subsystem add --subsystem ${NQN}7 --no-group-append + cephnvmf_func host add --subsystem 
${NQN}7 --host-nqn ${NQN}host21 --psk "${PSK_KEY1}"
+    make -s exec SVC=ceph OPTS=-T CMD="rados --pool rbd listomapvals nvmeof.state" | grep "host21"
+    sed -i 's#encryption_key = /var/log/ceph/ex_encryption.key#encryption_key = /etc/ceph/encryption.key#' ceph-nvmeof.conf
+    docker exec ${NVMEOF_CONTAINER_NAME} rm -f /var/log/ceph/ex_encryption.key /tmp/create_enckey.sh
+
     echo "ℹ️ use invalid encryption key"
-    sed -i '/enable_key_encryption/d' ceph-nvmeof.conf
+    docker exec ${NVMEOF_CONTAINER_NAME} rm -f /var/log/ceph/bad_encryption.key /tmp/create_enckey.sh
+    rm -f /tmp/create_enckey.sh
+    echo "#!/bin/bash" > /tmp/create_enckey.sh
+    echo 'echo -n "MIIBVAIBADANBgkqhkiG9w0BAQEFAASCAT4wggE6AgEAAkEAqg+wrkvj9D47BRVi A4tMOv4aBL6RLBbLEwYuhJSLTG6FagZFNknjRj0y9s5C+J0fktl3XMu9UmyUR1LR 3ojPlwIDAQABAkA2F9ONPVp+4CSJ02lf0zkmMpk4FR28NmvV20uEpHNClggqmjmW zFjGV+KHJ//r17gQD3yh+NvJzX9FlncseluBAiEA3MjrizLw6wjsk80IaGL8oQNd cUlD2wYTW6Gk7JLlFmECIQDFL6Chljk3rBoPl0jASBFHq1FT/Zqgg/z060OWBns4 9wIhAKkd3g7J/nCKbWzpaL9M02YiRbk4/ZkPllRiBQqRmpkBAiAgCx9VYu4lZ+hM RE9kP9HfDa4HshygnRJMUrcG+EKp/QIgR5uDteq1fToI5ZbYOf+KJsVoJOpPrN3b vPKX3JuIds8= -----END PRIVATE KEY-----" > /var/log/ceph/bad_encryption.key' >> /tmp/create_enckey.sh
+    chmod 755 /tmp/create_enckey.sh
+    docker cp /tmp/create_enckey.sh ${NVMEOF_CONTAINER_NAME}:/tmp/
+    docker exec ${NVMEOF_CONTAINER_NAME} /tmp/create_enckey.sh
+    rm -f /tmp/create_enckey.sh
+    sed -i 's#encryption_key = /etc/ceph/encryption.key#encryption_key = /var/log/ceph/bad_encryption.key#' ceph-nvmeof.conf
+    docker restart ${NVMEOF_CONTAINER_NAME}
+    sleep 20
+    cephnvmf_func subsystem add --subsystem ${NQN}8 --no-group-append
+    set +e
+    cephnvmf_func host add --subsystem ${NQN}8 --host-nqn ${NQN}host22 --psk "${PSK_KEY1}"
+    if [[ $? -eq 0 ]]; then
+        echo "Add host with PSK key should fail without valid encryption key"
+        exit 1
+    fi
+    set -e
+    make -s exec SVC=ceph OPTS=-T CMD="rados --pool rbd listomapvals nvmeof.state" | grep -q -v "host22"
+    sed -i 's#encryption_key = /var/log/ceph/bad_encryption.key#encryption_key = /etc/ceph/encryption.key#' ceph-nvmeof.conf
+    docker exec ${NVMEOF_CONTAINER_NAME} rm -f /var/log/ceph/bad_encryption.key /tmp/create_enckey.sh
+
+    echo "ℹ️ use missing encryption key"
     sed -i 's#encryption_key = /etc/ceph/encryption.key#encryption_key = /etc/ceph/XXXencryption.key#' ceph-nvmeof.conf
-    container_id=$(docker ps -q -f name=nvmeof)
-    docker restart ${container_id}
+    docker restart ${NVMEOF_CONTAINER_NAME}
     sleep 20
     cephnvmf_func subsystem add --subsystem ${NQN}6 --no-group-append
     set +e
@@ -499,14 +539,12 @@ function demo_bdevperf_psk()
         exit 1
     fi
     set -e
+    sed -i 's#encryption_key = /etc/ceph/XXXencryption.key#encryption_key = /etc/ceph/encryption.key#' ceph-nvmeof.conf

     echo "ℹ️ disable key encryption"
     sed -i '/enable_key_encryption/d' ceph-nvmeof.conf
     sed -i '/encryption_key/i enable_key_encryption = False' ceph-nvmeof.conf
-    sed -i '/encryption_key/d' ceph-nvmeof.conf
-    sed -i '#encryption_key#i #encryption_key = /etc/ceph/encryption.key#' ceph-nvmeof.conf
-    container_id=$(docker ps -q -f name=nvmeof)
-    docker restart ${container_id}
+    docker restart ${NVMEOF_CONTAINER_NAME}
     sleep 20
     cephnvmf_func subsystem add --subsystem ${NQN}5 --no-group-append
     cephnvmf_func host add --subsystem ${NQN}5 --host-nqn ${NQN}host17 --psk "${PSK_KEY1}"
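The demo_test.sh hunks above and below all repeat one pattern: rewrite the encryption_key entry of ceph-nvmeof.conf with sed, restart the gateway container, and then assert that adding a host with a PSK or DH-HMAC-CHAP key succeeds or fails accordingly. sed is presumably preferred here because it leaves the file's comments and ordering untouched; a config-aware equivalent (a hypothetical helper, not part of this patch) would look roughly like:

    import configparser

    def set_encryption_key(conf_path, key_path):
        # Interpolation is disabled so literal '%' characters in values survive.
        cfg = configparser.ConfigParser(interpolation=None)
        cfg.read(conf_path)
        if not cfg.has_section("gateway"):
            cfg.add_section("gateway")
        cfg.set("gateway", "encryption_key", key_path)
        # Caveat: write() reflows the file and drops comments, which is
        # likely why the shell tests stick with sed for this edit.
        with open(conf_path, "w") as f:
            cfg.write(f)

    # e.g. set_encryption_key("ceph-nvmeof.conf", "/var/log/ceph/bad_encryption.key")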
@@ -894,8 +932,7 @@ function demo_bdevperf_dhchap()
     echo "ℹ️ use invalid encryption key"
     sed -i '/enable_key_encryption/d' ceph-nvmeof.conf
     sed -i 's#encryption_key = /etc/ceph/encryption.key#encryption_key = /etc/ceph/XXXencryption.key#' ceph-nvmeof.conf
-    container_id=$(docker ps -q -f name=nvmeof)
-    docker restart ${container_id}
+    docker restart ${NVMEOF_CONTAINER_NAME}
     sleep 20
     cephnvmf_func subsystem add --subsystem ${NQN}4 --dhchap-key "${DHCHAP_KEY10}" --no-group-append
     set +e
@@ -916,8 +953,7 @@ function demo_bdevperf_dhchap()
     sed -i '/encryption_key/i enable_key_encryption = False' ceph-nvmeof.conf
     sed -i '/encryption_key/d' ceph-nvmeof.conf
     sed -i '#encryption_key#i #encryption_key = /etc/ceph/encryption.key#' ceph-nvmeof.conf
-    container_id=$(docker ps -q -f name=nvmeof)
-    docker restart ${container_id}
+    docker restart ${NVMEOF_CONTAINER_NAME}
     sleep 20
     cephnvmf_func subsystem add --subsystem ${NQN}3 --dhchap-key "${DHCHAP_KEY10}" --no-group-append
     cephnvmf_func host add --subsystem ${NQN}3 --host-nqn ${NQN}host7 --dhchap-key "${DHCHAP_KEY11}"
diff --git a/tests/ha/no_subsystems.sh b/tests/ha/no_subsystems.sh
index 11c2ed35869..e9927ddbb88 100755
--- a/tests/ha/no_subsystems.sh
+++ b/tests/ha/no_subsystems.sh
@@ -15,7 +15,7 @@ NQN="nqn.2016-06.io.spdk:cnode1"
 verify_gw_exists_and_no_subs() {
   IP=$1
-  subs=$(docker compose run -T --rm nvmeof-cli --server-address $IP --server-port 5500 --output stdio --format json get_subsystems)
+  subs=$(docker compose run -T --rm nvmeof-cli --server-address $IP --server-port 5500 --output stdio --format json subsystem list)
   echo "show subsystems after del : $subs"
   if echo "$subs" | grep -q '"subsystems": \[\]'; then
     echo "The string contains 'subsystems:[]' on GW ip $IP"
@@ -42,7 +42,7 @@ NQN="nqn.2016-06.io.spdk:cnode1"
   docker compose run --rm nvmeof-cli --server-address $ip2 --server-port 5500 listener add --subsystem $NQN --host-name $GW2_NAME --traddr $ip2 --trsvcid 4420
   sleep 5
-  subs=$(docker compose run -T --rm nvmeof-cli --server-address $ip --server-port 5500 --output stdio --format json get_subsystems)
+  subs=$(docker compose run -T --rm nvmeof-cli --server-address $ip --server-port 5500 --output stdio --format json subsystem list)
   echo "subsystems $subs"

   #test that ana group is Active
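The new test file below grows the gateway state until the OMAP object that stores it has to be read back in several chunks: 128 subsystems, each with 8 namespaces and 12 hosts, plus one listener per gateway, with update notifications disabled so the two gateways only converge through the periodic 300-second OMAP poll. A rough scale check (approximate back-of-envelope only, assuming about one OMAP key per subsystem, namespace, host, and listener):

    subsystems, namespaces, hosts, listeners = 128, 8, 12, 2
    keys = subsystems * (1 + namespaces + hosts + listeners)
    print(keys)  # 2944 state keys, enough to exercise multi-chunk OMAP reads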
diff --git a/tests/test_big_omap.py b/tests/test_big_omap.py
new file mode 100644
index 00000000000..5716ee1f903
--- /dev/null
+++ b/tests/test_big_omap.py
@@ -0,0 +1,202 @@
+import pytest
+from control.server import GatewayServer
+from control.cli import main as cli
+from control.cephutils import CephUtils
+import grpc
+from control.proto import gateway_pb2_grpc as pb2_grpc
+import copy
+import time
+import os
+
+image_prefix = "mytestdevimage"
+pool = "rbd"
+subsystem_prefix = "nqn.2016-06.io.spdk:cnode"
+host_prefix = "nqn.2014-08.org.nvmexpress:uuid:893a6752-fe9b-ca48-aa93-e4565f3288"
+subsystem_count = 128
+namespace_count = 8
+host_count = 12
+anagrpid = "1"
+anagrpid2 = "2"
+group_name = "group1"
+max_subsystems = 1024
+max_namespaces = 5120
+max_hosts = 5000
+update_interval = 300
+
+
+@pytest.fixture(scope="module")
+def two_gateways(config):
+    """Sets up and tears down two Gateways"""
+    nameA = "GatewayAA"
+    nameB = "GatewayBB"
+    sockA = f"spdk_{nameA}.sock"
+    sockB = f"spdk_{nameB}.sock"
+    config.config["gateway-logs"]["log_level"] = "debug"
+    config.config["gateway"]["group"] = group_name
+    config.config["gateway"]["max_subsystems"] = f"{max_subsystems}"
+    config.config["gateway"]["max_namespaces"] = f"{max_namespaces}"
+    config.config["gateway"]["max_hosts"] = f"{max_hosts}"
+    config.config["gateway"]["rebalance_period_sec"] = "0"
+    config.config["gateway"]["state_update_notify"] = "False"
+    config.config["gateway"]["state_update_interval_sec"] = f"{update_interval}"
+    addr = config.get("gateway", "addr")
+    configA = copy.deepcopy(config)
+    configB = copy.deepcopy(config)
+    configA.config["gateway"]["name"] = nameA
+    configA.config["gateway"]["override_hostname"] = nameA
+    configA.config["spdk"]["rpc_socket_name"] = sockA
+    if os.cpu_count() >= 4:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    else:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
+    portA = configA.getint("gateway", "port")
+    configB.config["gateway"]["name"] = nameB
+    configB.config["gateway"]["override_hostname"] = nameB
+    configB.config["spdk"]["rpc_socket_name"] = sockB
+    portB = portA + 2
+    discPortB = configB.getint("discovery", "port") + 1
+    configB.config["gateway"]["port"] = str(portB)
+    configB.config["discovery"]["port"] = str(discPortB)
+    if os.cpu_count() >= 4:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
+
+    ceph_utils = CephUtils(config)
+    with (GatewayServer(configA) as gatewayA, GatewayServer(configB) as gatewayB):
+        ceph_utils.execute_ceph_monitor_command(
+            "{" + f'"prefix":"nvme-gw create", "id": "{nameA}", "pool": "{pool}", '
+            f'"group": "{group_name}"' + "}"
+        )
+        ceph_utils.execute_ceph_monitor_command(
+            "{" + f'"prefix":"nvme-gw create", "id": "{nameB}", "pool": "{pool}", '
+            f'"group": "{group_name}"' + "}"
+        )
+        gatewayA.serve()
+        gatewayB.serve()
+
+        channelA = grpc.insecure_channel(f"{addr}:{portA}")
+        pb2_grpc.GatewayStub(channelA)
+        channelB = grpc.insecure_channel(f"{addr}:{portB}")
+        pb2_grpc.GatewayStub(channelB)
+
+        yield gatewayA, gatewayB
+        if gatewayA and gatewayA.server:
+            gatewayA.server.stop(grace=1)
+        if gatewayB and gatewayB.server:
+            gatewayB.server.stop(grace=1)
+
+
+def verify_one_namespace_lb_group(caplog, gw_port, subsys, nsid_to_verify, grp):
+    caplog.clear()
+    cli(["--server-port", gw_port, "--format", "json", "namespace", "list",
+         "--subsystem", subsys, "--nsid", str(nsid_to_verify)])
+    assert f'"nsid": {nsid_to_verify},' in caplog.text
+    assert f'"load_balancing_group": {grp},' in caplog.text
+
+
+def verify_namespaces(caplog, gw_port, subsys, first_nsid, last_nsid, grp):
+    for ns in range(first_nsid, last_nsid + 1):
+        verify_one_namespace_lb_group(caplog, gw_port, subsys, ns, grp)
+
+
+def verify_resources(caplog, gw_port, subsys_cnt, ns_cnt, grp):
+    for subsys_id in range(1, subsys_cnt + 1):
+        subsys = f"{subsystem_prefix}{subsys_id}"
+        verify_namespaces(caplog, gw_port, subsys, 1, ns_cnt, grp)
+
+
+def create_resources(caplog, subsys_cnt, host_cnt, ns_cnt, grp):
+    img_id = 1
+    for subsys_id in range(1, subsys_cnt + 1):
+        subsys = f"{subsystem_prefix}{subsys_id}"
+        caplog.clear()
+        cli(["subsystem", "add", "--subsystem", subsys, "--no-group-append",
+             "--max-namespaces", f"{2 * ns_cnt}"])
+        assert f"Adding subsystem {subsys}: Successful" in caplog.text
+        for ns_id in range(1, ns_cnt + 1):
+            caplog.clear()
+            image = f"{image_prefix}{ns_id}"
+            cli(["namespace", "add", "--subsystem", subsys, "--rbd-pool", pool,
+                 "--rbd-image", f"{image}{img_id}", "--size", "10MB", "--rbd-create-image",
+                 "--load-balancing-group", grp])
+            assert f"Adding namespace {ns_id} to {subsys}: Successful" in caplog.text
+            img_id += 1
+        for host_id in range(1, host_cnt + 1):
+            caplog.clear()
+            host_nqn = f"{host_prefix}{host_id:02x}"
+            cli(["host", "add", "--subsystem", subsys, "--host-nqn", host_nqn])
+            assert f"Adding host {host_nqn} to {subsys}: Successful" in caplog.text
+
+
+def create_listeners(caplog, gw_name, gw_port, subsys_cnt, addr, start_port):
+    port = int(start_port)
+    for subsys_id in range(1, subsys_cnt + 1):
+        subsys = f"{subsystem_prefix}{subsys_id}"
+        caplog.clear()
+        cli(["--server-port", gw_port, "listener", "add", "--subsystem", subsys,
+             "--host-name", gw_name, "--traddr", addr, "--trsvcid", str(port)])
+        assert f"Adding {subsys} listener at {addr}:{port}: Successful" in caplog.text
+        port += 1
+
+
+def change_namespace_lb_group(caplog, gw_port1, gw_port2, subsys, nsid, grp):
+    cli(["--server-port", gw_port1, "namespace", "change_load_balancing_group",
+         "--subsystem", subsys, "--nsid", str(nsid), "--load-balancing-group", grp])
+    cli(["--server-port", gw_port2, "namespace", "change_load_balancing_group",
+         "--subsystem", subsys, "--nsid", str(nsid), "--load-balancing-group", grp])
+
+
+def change_all_namespaces_lb_group(caplog, gw_port1, gw_port2, subsys, grp):
+    for ns in range(1, namespace_count + 1):
+        change_namespace_lb_group(caplog, gw_port1, gw_port2, subsys, ns, grp)
+
+
+def change_lb_group_for_all_subsystems(caplog, gw_port1, gw_port2, grp):
+    for subsys_id in range(1, subsystem_count + 1):
+        subsys = f"{subsystem_prefix}{subsys_id}"
+        change_all_namespaces_lb_group(caplog, gw_port1, gw_port2, subsys, grp)
+
+
+def test_big_omap(caplog, two_gateways):
+    gatewayA, gatewayB = two_gateways
+    gwA = gatewayA.gateway_rpc
+    gwB = gatewayB.gateway_rpc
+
+    create_resources(caplog, subsystem_count, host_count, namespace_count, anagrpid)
+    waitForUpdate = max(int(gwA.config.config["gateway"]["state_update_interval_sec"]),
+                        int(gwB.config.config["gateway"]["state_update_interval_sec"]))
+    waitForUpdate += 10
+    time.sleep(waitForUpdate)
+    for port in [gwA.config.config["gateway"]["port"],
+                 gwB.config.config["gateway"]["port"]]:
+        verify_resources(caplog, port, subsystem_count, namespace_count, anagrpid)
+
+    create_listeners(caplog, gwA.host_name, gwA.config.config["gateway"]["port"],
+                     subsystem_count, "127.0.0.1", 3000)
+    create_listeners(caplog, gwB.host_name, gwB.config.config["gateway"]["port"],
+                     subsystem_count, "127.0.0.1", 4000)
+
+    time.sleep(waitForUpdate)
+    change_lb_group_for_all_subsystems(caplog,
+                                       gwA.config.config["gateway"]["port"],
+                                       gwB.config.config["gateway"]["port"],
+                                       anagrpid2)
+    time.sleep(waitForUpdate)
+    for port in [gwA.config.config["gateway"]["port"],
+                 gwB.config.config["gateway"]["port"]]:
+        verify_resources(caplog, port, subsystem_count, namespace_count, anagrpid2)
+
+    configB = gwB.config
+    portB = gwB.config.config["gateway"]["port"]
+    gatewayB.__exit__(None, None, None)
+    time.sleep(15)
+    gatewayB = GatewayServer(configB)
+    ceph_utils = CephUtils(configB)
+    ceph_utils.execute_ceph_monitor_command(
+        "{" + f'"prefix":"nvme-gw create", "id": "{gatewayB.name}", "pool": "{pool}", '
+        f'"group": "{group_name}"' + "}"
+    )
+    gatewayB.serve()
+    time.sleep(waitForUpdate)
+    verify_resources(caplog, portB, subsystem_count, namespace_count, anagrpid2)
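The test_cli.py changes that follow all lean on the repository's caplog-driven test idiom: the CLI entry point (control.cli.main, imported as cli) runs in-process, logs the gateway's reply, and the test asserts on the captured log text rather than on a return value. Reduced to a sketch (the helper name is illustrative, not from the repository):

    from control.cli import main as cli

    def expect_cli(caplog, args, expected):
        # The CLI logs the server response; the assertion reads the
        # pytest-captured log text.
        caplog.clear()
        cli(args)
        assert expected in caplog.text

    # e.g. expect_cli(caplog, ["subsystem", "add", "--subsystem", nqn,
    #                          "--no-group-append"],
    #                 f"Adding subsystem {nqn}: Successful")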
"nqn.2016-06.io.spdk:cnode9" subsystem10 = "nqn.2016-06.io.spdk:cnode10" subsystem11 = "nqn.2016-06.io.spdk:cnode11" +subsystem12 = "nqn.2016-06.io.spdk:cnode12" +subsystem16 = "nqn.2016-06.io.spdk:cnode16" +subsystem17 = "nqn.2016-06.io.spdk:cnode17" +subsystem18 = "nqn.2016-06.io.spdk:cnode18" subsystemX = "nqn.2016-06.io.spdk:cnodeX" discovery_nqn = "nqn.2014-08.org.nvmexpress.discovery" serial = "Ceph00000000000001" +serial2 = "Ceph00000000000002" uuid = "948878ee-c3b2-4d58-a29b-2cff713fc02d" uuid2 = "948878ee-c3b2-4d58-a29b-2cff713fc02e" host_list = ["nqn.2016-06.io.spdk:host1", "*"] @@ -64,6 +70,10 @@ host9 = "nqn.2016-06.io.spdk:host9" host10 = "nqn.2016-06.io.spdk:host10" host11 = "nqn.2016-06.io.spdk:host11" +host12 = "nqn.2016-06.io.spdk:host12" +host13 = "nqn.2016-06.io.spdk:host13" +host14 = "nqn.2016-06.io.spdk:host14" +host15 = "nqn.2016-06.io.spdk:host15" hostxx = "nqn.2016-06.io.spdk:hostXX" nsid = "1" anagrpid = "1" @@ -257,18 +267,23 @@ def test_create_subsystem(self, caplog, gateway): assert "contains invalid characters" in caplog.text caplog.clear() cli(["subsystem", "add", "--subsystem", subsystem, - "--max-namespaces", "2049", "--no-group-append"]) - assert f"The requested max number of namespaces for subsystem {subsystem} (2049) " \ + "--max-namespaces", "3700", "--no-group-append"]) + assert f"Failure creating subsystem {subsystem}: Max namespaces " \ + f"can't be greater than 2048" in caplog.text + caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem, + "--max-namespaces", "2039", "--no-group-append"]) + assert f"The requested max number of namespaces for subsystem {subsystem} (2039) " \ f"is greater than the global limit on the number of namespaces (12), " \ f"will continue" in caplog.text assert f"Adding subsystem {subsystem}: Successful" in caplog.text cli(["--format", "json", "subsystem", "list"]) assert f'"serial_number": "{serial}"' not in caplog.text assert f'"nqn": "{subsystem}"' in caplog.text - assert '"max_namespaces": 2049' in caplog.text + assert '"max_namespaces": 2039' in caplog.text caplog.clear() cli(["subsystem", "add", "--subsystem", subsystem, - "--max-namespaces", "2049", "--no-group-append"]) + "--max-namespaces", "2039", "--no-group-append"]) assert f"Failure creating subsystem {subsystem}: Subsystem already exists" in caplog.text caplog.clear() cli(["subsystem", "add", "--subsystem", subsystem2, @@ -509,7 +524,7 @@ def test_add_namespace(self, caplog, gateway): caplog.clear() cli(["namespace", "add", "--subsystem", subsystem, "--rbd-pool", pool, "--rbd-image", image, "--block-size", "1024", - "--load-balancing-group", anagrpid, "--force"]) + "--load-balancing-group", anagrpid, "--rbd-create-image", "--size", "16MB", "--force"]) assert f"Adding namespace 2 to {subsystem}: Successful" in caplog.text caplog.clear() cli(["--format", "json", "namespace", "list", "--subsystem", subsystem, "--nsid", nsid]) @@ -613,6 +628,9 @@ def test_add_namespace_no_auto_visible(self, caplog, gateway): assert "no_auto_visible: True" in caplog.text def test_add_host_to_namespace(self, caplog, gateway): + caplog.clear() + cli(["host", "add", "--subsystem", subsystem, "--host-nqn", host8]) + assert f"Adding host {host8} to {subsystem}: Successful" in caplog.text caplog.clear() cli(["namespace", "add_host", "--subsystem", subsystem, "--nsid", "8", "--host-nqn", host8]) assert f"Adding host {host8} to namespace 8 on {subsystem}: Successful" in caplog.text @@ -623,15 +641,15 @@ def test_add_host_to_namespace(self, caplog, gateway): def 
test_add_too_many_hosts_to_namespace(self, caplog, gateway): caplog.clear() cli(["namespace", "add_host", "--subsystem", subsystem, "--nsid", "8", - "--host-nqn", host9]) + "--host-nqn", host9, "--force"]) assert f"Adding host {host9} to namespace 8 on {subsystem}: Successful" in caplog.text caplog.clear() cli(["namespace", "add_host", "--subsystem", subsystem, "--nsid", "8", - "--host-nqn", host10]) + "--host-nqn", host10, "--force"]) assert f"Adding host {host10} to namespace 8 on {subsystem}: Successful" in caplog.text caplog.clear() cli(["namespace", "add_host", "--subsystem", subsystem, "--nsid", "8", - "--host-nqn", host11]) + "--host-nqn", host11, "--force"]) assert f"Failure adding host {host11} to namespace 8 on {subsystem}: " \ f"Maximal host count for namespace (3) has already been reached" in caplog.text @@ -641,6 +659,92 @@ def test_add_all_hosts_to_namespace(self, caplog, gateway): assert f"Failure adding host * to namespace 8 on {subsystem}: " \ f"Host NQN can't be \"*\"" in caplog.text + def test_add_host_to_namespace_no_access(self, caplog, gateway): + caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem12, "--no-group-append"]) + assert f"Adding subsystem {subsystem12}: Successful" in caplog.text + caplog.clear() + cli(["namespace", "add", "--subsystem", subsystem12, "--rbd-pool", pool, + "--rbd-image", image24, "--size", "16MB", "--rbd-create-image", "--no-auto-visible"]) + assert f"Adding namespace 1 to {subsystem12}: Successful" in caplog.text + caplog.clear() + cli(["--format", "json", "namespace", "list", "--subsystem", subsystem12, "--nsid", "1"]) + assert '"status": 0' in caplog.text + assert f'"subsystem_nqn": "{subsystem12}",' in caplog.text + assert '"hosts": []' in caplog.text + caplog.clear() + cli(["namespace", "add_host", "--subsystem", subsystem12, "--nsid", "1", + "--host-nqn", host12]) + assert f"Failure adding host {host12} to namespace 1 on {subsystem12}: " \ + f"Host is not allowed to access the subsystem" in caplog.text + caplog.clear() + cli(["--format", "json", "namespace", "list", "--subsystem", subsystem12, "--nsid", "1"]) + assert '"status": 0' in caplog.text + assert f'"subsystem_nqn": "{subsystem12}",' in caplog.text + assert '"hosts": []' in caplog.text + assert f'"{host12}"' not in caplog.text + caplog.clear() + cli(["host", "add", "--subsystem", subsystem12, "--host-nqn", "*"]) + assert f"Subsystem {subsystem12} will be opened to be accessed from any " \ + f"host. 
This might be a security breach" in caplog.text + assert f"Allowing open host access to {subsystem12}: Successful" in caplog.text + assert f"Open host access to subsystem {subsystem12} might be a " \ + f"security breach" in caplog.text + caplog.clear() + cli(["namespace", "add_host", "--subsystem", subsystem12, "--nsid", "1", + "--host-nqn", host12]) + assert f"Adding host {host12} to namespace 1 on {subsystem12}: Successful" in caplog.text + caplog.clear() + cli(["--format", "json", "namespace", "list", "--subsystem", subsystem12, "--nsid", "1"]) + assert '"status": 0' in caplog.text + assert f'"subsystem_nqn": "{subsystem12}",' in caplog.text + assert f'"{host12}"' in caplog.text + assert '"hosts": []' not in caplog.text + caplog.clear() + cli(["namespace", "del_host", "--subsystem", subsystem12, "--nsid", "1", + "--host-nqn", host12]) + assert f"Deleting host {host12} from namespace 1 on {subsystem12}: " \ + f"Successful" in caplog.text + caplog.clear() + cli(["--format", "json", "namespace", "list", "--subsystem", subsystem12, "--nsid", "1"]) + assert '"status": 0' in caplog.text + assert f'"subsystem_nqn": "{subsystem12}",' in caplog.text + assert f'"{host12}"' not in caplog.text + assert '"hosts": []' in caplog.text + caplog.clear() + cli(["host", "del", "--subsystem", subsystem12, "--host-nqn", "*"]) + assert f"Disabling open host access to {subsystem12}: Successful" in caplog.text + caplog.clear() + cli(["namespace", "add_host", "--subsystem", subsystem12, "--nsid", "1", + "--host-nqn", host12]) + assert f"Failure adding host {host12} to namespace 1 on {subsystem12}: " \ + f"Host is not allowed to access the subsystem" in caplog.text + caplog.clear() + cli(["--format", "json", "namespace", "list", "--subsystem", subsystem12, "--nsid", "1"]) + assert '"status": 0' in caplog.text + assert f'"subsystem_nqn": "{subsystem12}",' in caplog.text + assert '"hosts": []' in caplog.text + assert f'"{host12}"' not in caplog.text + caplog.clear() + cli(["namespace", "add_host", "--subsystem", subsystem12, "--nsid", "1", + "--host-nqn", host12, "--force"]) + assert f"Adding host {host12} to namespace 1 on {subsystem12}: Successful" in caplog.text + assert f"Host {host12} is not allowed to access subsystem {subsystem12} but it will " \ + f"be added to namespace 1 as the \"--force\" parameter " \ + f"was used" in caplog.text + caplog.clear() + cli(["--format", "json", "namespace", "list", "--subsystem", subsystem12, "--nsid", "1"]) + assert '"status": 0' in caplog.text + assert f'"subsystem_nqn": "{subsystem12}",' in caplog.text + assert f'"{host12}"' in caplog.text + assert '"hosts": []' not in caplog.text + caplog.clear() + cli(["namespace", "del", "--subsystem", subsystem12, "--nsid", "1"]) + assert f"Deleting namespace 1 from {subsystem12}: Successful" in caplog.text + caplog.clear() + cli(["subsystem", "del", "--subsystem", subsystem12]) + assert f"Deleting subsystem {subsystem12}: Successful" in caplog.text + def test_change_namespace_visibility(self, caplog, gateway): caplog.clear() cli(["namespace", "change_visibility", "--subsystem", subsystem, "--nsid", "8", @@ -765,7 +869,7 @@ def test_add_too_many_namespaces_to_a_subsystem(self, caplog, gateway): cli(["namespace", "add", "--subsystem", subsystem, "--rbd-pool", pool, "--rbd-image", image9, "--nsid", "3000", "--size", "16MB", "--rbd-create-image"]) assert f"Failure adding namespace using ID 3000 to {subsystem}: " \ - f"Requested ID 3000 is bigger than the maximal one (2049)" in caplog.text + f"Requested ID 3000 is bigger than the maximal 
one (2039)" in caplog.text assert "Received request to delete bdev" in caplog.text caplog.clear() cli(["subsystem", "add", "--subsystem", subsystem5, "--no-group-append", @@ -799,12 +903,6 @@ def test_add_junk_host_to_namespace(self, caplog, gateway): assert f"Failure adding host junk to namespace 8 on {subsystem}: " \ f"Invalid host NQN" in caplog.text - def test_add_host_to_namespace_junk_subsystem(self, caplog, gateway): - caplog.clear() - cli(["namespace", "add_host", "--subsystem", "junk", "--nsid", "8", "--host-nqn", hostxx]) - assert f"Failure adding host {hostxx} to namespace 8 on junk: " \ - f"Can't find subsystem" in caplog.text - def test_add_host_to_namespace_subsystem_not_found(self, caplog, gateway): caplog.clear() cli(["namespace", "add_host", "--subsystem", subsystemX, "--nsid", "8", @@ -890,6 +988,15 @@ def test_del_namespace_host(self, caplog, gateway): f"Host is not found in namespace's host list" in caplog.text def test_add_namespace_multiple_hosts(self, caplog, gateway): + caplog.clear() + cli(["host", "add", "--subsystem", subsystem, "--host-nqn", host8]) + assert f"Failure adding host {host8} to {subsystem}: Host is already added" in caplog.text + caplog.clear() + cli(["host", "add", "--subsystem", subsystem, "--host-nqn", host9]) + assert f"Adding host {host9} to {subsystem}: Successful" in caplog.text + caplog.clear() + cli(["host", "add", "--subsystem", subsystem, "--host-nqn", host10]) + assert f"Adding host {host10} to {subsystem}: Successful" in caplog.text caplog.clear() cli(["namespace", "add_host", "--subsystem", subsystem, "--nsid", "9", "--host-nqn", host8, host9, host10]) @@ -905,6 +1012,37 @@ def test_add_namespace_multiple_hosts(self, caplog, gateway): assert f'"{host10}"' in caplog.text assert '"hosts": []' not in caplog.text + def test_list_hosts(self, caplog, gateway): + caplog.clear() + cli(["--format", "json", "host", "list", "--subsystem", subsystem]) + assert '"status": 0,' in caplog.text + assert f'"subsystem_nqn": "{subsystem}",' in caplog.text + assert f'"nqn": "{host8}",' in caplog.text + assert f'"nqn": "{host9}",' in caplog.text + assert f'"nqn": "{host10}",' in caplog.text + assert '"use_psk": true,' not in caplog.text + assert '"use_dhchap": true,' not in caplog.text + assert '"allow_any_host": false' in caplog.text + caplog.clear() + hosts = cli_test(["host", "list", "--subsystem", subsystem]) + assert hosts is not None + assert hosts.status == 0 + assert not hosts.allow_any_host + assert hosts.subsystem_nqn == subsystem + assert len(hosts.hosts) == 3 + assert hosts.hosts[0].nqn in [host8, host9, host10] + assert hosts.hosts[1].nqn in [host8, host9, host10] + assert hosts.hosts[2].nqn in [host8, host9, host10] + assert hosts.hosts[0].nqn != hosts.hosts[1].nqn + assert hosts.hosts[0].nqn != hosts.hosts[2].nqn + assert hosts.hosts[1].nqn != hosts.hosts[2].nqn + assert not hosts.hosts[0].use_psk + assert not hosts.hosts[1].use_psk + assert not hosts.hosts[2].use_psk + assert not hosts.hosts[0].use_dhchap + assert not hosts.hosts[1].use_dhchap + assert not hosts.hosts[2].use_dhchap + def test_del_namespace_multiple_hosts(self, caplog, gateway): caplog.clear() cli(["namespace", "del_host", "--subsystem", subsystem, "--nsid", "9", @@ -920,6 +1058,15 @@ def test_del_namespace_multiple_hosts(self, caplog, gateway): assert f'"{host9}"' not in caplog.text assert f'"{host10}"' not in caplog.text assert '"hosts": []' in caplog.text + caplog.clear() + cli(["host", "del", "--subsystem", subsystem, "--host-nqn", host8]) + assert f"Removing host {host8} 
access from {subsystem}: Successful" in caplog.text + caplog.clear() + cli(["host", "del", "--subsystem", subsystem, "--host-nqn", host9]) + assert f"Removing host {host9} access from {subsystem}: Successful" in caplog.text + caplog.clear() + cli(["host", "del", "--subsystem", subsystem, "--host-nqn", host10]) + assert f"Removing host {host10} access from {subsystem}: Successful" in caplog.text def test_list_namespace_with_no_hosts(self, caplog, gateway): caplog.clear() @@ -1484,15 +1631,10 @@ def test_remove_not_existing_host(self, caplog, gateway): def remove_host_list(self, caplog): caplog.clear() - cli(["host", "del", "--subsystem", subsystem, - "--host-nqn", "nqn.2016-06.io.spdk:host5", "nqn.2016-06.io.spdk:host6", - "nqn.2016-06.io.spdk:host7"]) - assert f"Removing host nqn.2016-06.io.spdk:host5 access from {subsystem}: " \ - f"Successful" in caplog.text - assert f"Removing host nqn.2016-06.io.spdk:host6 access from {subsystem}: " \ - f"Successful" in caplog.text - assert f"Removing host nqn.2016-06.io.spdk:host7 access from {subsystem}: " \ - f"Successful" in caplog.text + cli(["host", "del", "--subsystem", subsystem, "--host-nqn", host5, host6, host7]) + assert f"Removing host {host5} access from {subsystem}: Successful" in caplog.text + assert f"Removing host {host6} access from {subsystem}: Successful" in caplog.text + assert f"Removing host {host7} access from {subsystem}: Successful" in caplog.text @pytest.mark.parametrize("listener", listener_list) def test_delete_listener_using_wild_hostname_no_force(self, caplog, listener, gateway): @@ -2036,3 +2178,171 @@ def test_listener_bad_ip_adresses(self, caplog, gateway): pass assert "error: IP address :: is not an IPv4 address" in caplog.text assert rc == 2 + + +class TestSubsystemsCache: + def test_subsystems_cache(self, caplog, gateway): + gw, _ = gateway + subs = cli_test(["subsystem", "list"]) + for s in subs.subsystems: + cli(["subsystem", "del", "--subsystem", s.nqn, "--force"]) + subs = cli_test(["subsystem", "list"]) + assert len(subs.subsystems) == 0 + caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem16, "--no-group-append"]) + assert f"Adding subsystem {subsystem16}: Successful" in caplog.text + caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem17, "--no-group-append"]) + assert f"Adding subsystem {subsystem17}: Successful" in caplog.text + caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem18, "--no-group-append", + "--serial-number", serial2]) + assert f"Adding subsystem {subsystem18}: Successful" in caplog.text + caplog.clear() + cli(["get_subsystems"]) + assert f'"nqn": "{subsystem16}",' not in caplog.text + assert f'"nqn": "{subsystem17}",' not in caplog.text + assert f'"nqn": "{subsystem18}",' not in caplog.text + assert f'"serial_number": "{serial2}",' not in caplog.text + # Only after the call to "subsystem list" we should get a fresh cache + caplog.clear() + cli(["--format", "json", "subsystem", "list"]) + assert '"status": 0' in caplog.text + assert f'"nqn": "{subsystem16}",' in caplog.text + assert f'"nqn": "{subsystem17}",' in caplog.text + assert f'"nqn": "{subsystem18}",' in caplog.text + assert f'"serial_number": "{serial2}",' in caplog.text + caplog.clear() + cli(["get_subsystems"]) + assert f'"nqn": "{subsystem16}",' in caplog.text + assert f'"nqn": "{subsystem17}",' in caplog.text + assert f'"nqn": "{subsystem18}",' in caplog.text + assert f'"serial_number": "{serial2}",' in caplog.text + caplog.clear() + cli(["host", "add", "--subsystem", subsystem16, 
"--host-nqn", host13]) + assert f"Adding host {host13} to {subsystem16}: Successful" in caplog.text + caplog.clear() + cli(["host", "add", "--subsystem", subsystem17, "--host-nqn", host14]) + assert f"Adding host {host14} to {subsystem17}: Successful" in caplog.text + caplog.clear() + cli(["host", "add", "--subsystem", subsystem18, "--host-nqn", host15]) + assert f"Adding host {host15} to {subsystem18}: Successful" in caplog.text + host_list_req = pb2.list_hosts_req(subsystem=subsystem16) + caplog.clear() + ret = gw.list_hosts(host_list_req) + assert ret.status == 0 + assert host13 not in caplog.text + assert "Received request to list hosts" in caplog.text + host_list_req = pb2.list_hosts_req(subsystem=subsystem17) + caplog.clear() + ret = gw.list_hosts(host_list_req) + assert ret.status == 0 + assert host14 not in caplog.text + assert "Received request to list hosts" in caplog.text + host_list_req = pb2.list_hosts_req(subsystem=subsystem18) + caplog.clear() + ret = gw.list_hosts(host_list_req) + assert ret.status == 0 + assert host15 not in caplog.text + assert "Received request to list hosts" in caplog.text + caplog.clear() + cli(["get_subsystems"]) + assert f'"nqn": "{subsystem16}",' in caplog.text + assert f'"nqn": "{subsystem17}",' in caplog.text + assert f'"nqn": "{subsystem18}",' in caplog.text + assert f'"serial_number": "{serial2}",' in caplog.text + assert host13 not in caplog.text + assert host14 not in caplog.text + assert host15 not in caplog.text + caplog.clear() + cli(["--format", "json", "host", "list", "--subsystem", subsystem16]) + assert '"status": 0' in caplog.text + assert f'"nqn": "{host13}",' in caplog.text + caplog.clear() + cli(["--format", "json", "host", "list", "--subsystem", subsystem17]) + assert '"status": 0' in caplog.text + assert f'"nqn": "{host14}",' in caplog.text + caplog.clear() + cli(["--format", "json", "host", "list", "--subsystem", subsystem18]) + assert '"status": 0' in caplog.text + assert f'"nqn": "{host15}",' in caplog.text + caplog.clear() + cli(["get_subsystems"]) + assert f'"nqn": "{subsystem16}",' in caplog.text + assert f'"nqn": "{subsystem17}",' in caplog.text + assert f'"nqn": "{subsystem18}",' in caplog.text + assert f'"serial_number": "{serial2}",' in caplog.text + assert host13 not in caplog.text + assert host14 not in caplog.text + assert host15 not in caplog.text + caplog.clear() + cli(["--format", "json", "subsystem", "list"]) + assert '"status": 0' in caplog.text + assert f'"nqn": "{subsystem16}",' in caplog.text + assert f'"nqn": "{subsystem17}",' in caplog.text + assert f'"nqn": "{subsystem18}",' in caplog.text + assert f'"serial_number": "{serial2}",' in caplog.text + caplog.clear() + cli(["get_subsystems"]) + assert f'"nqn": "{subsystem16}",' in caplog.text + assert f'"nqn": "{subsystem17}",' in caplog.text + assert f'"nqn": "{subsystem18}",' in caplog.text + assert f'"serial_number": "{serial2}",' in caplog.text + assert host13 in caplog.text + assert host14 in caplog.text + assert host15 in caplog.text + caplog.clear() + cli(["--format", "json", "subsystem", "list", "--serial", serial2]) + assert '"status": 0' in caplog.text + assert f'"nqn": "{subsystem16}",' not in caplog.text + assert f'"nqn": "{subsystem17}",' not in caplog.text + assert f'"nqn": "{subsystem18}",' in caplog.text + assert f'"serial_number": "{serial2}",' in caplog.text + caplog.clear() + cli(["get_subsystems"]) + assert f'"nqn": "{subsystem16}",' in caplog.text + assert f'"nqn": "{subsystem17}",' in caplog.text + assert f'"nqn": "{subsystem18}",' in 
caplog.text
+        assert f'"serial_number": "{serial2}",' in caplog.text
+        assert host13 in caplog.text
+        assert host14 in caplog.text
+        assert host15 in caplog.text
+        caplog.clear()
+        cli(["--format", "json", "subsystem", "list"])
+        caplog.clear()
+        cli(["subsystem", "del", "--subsystem", subsystem16])
+        assert f"Deleting subsystem {subsystem16}: Successful" in caplog.text
+        caplog.clear()
+        cli(["subsystem", "del", "--subsystem", subsystem17])
+        assert f"Deleting subsystem {subsystem17}: Successful" in caplog.text
+        caplog.clear()
+        cli(["subsystem", "del", "--subsystem", subsystem18])
+        assert f"Deleting subsystem {subsystem18}: Successful" in caplog.text
+        # Subsystems should still be in the cache
+        caplog.clear()
+        cli(["get_subsystems"])
+        assert '"subsystems": []' not in caplog.text
+        assert f'"nqn": "{subsystem16}",' in caplog.text
+        assert f'"nqn": "{subsystem17}",' in caplog.text
+        assert f'"nqn": "{subsystem18}",' in caplog.text
+        assert f'"serial_number": "{serial2}",' in caplog.text
+        assert host13 in caplog.text
+        assert host14 in caplog.text
+        assert host15 in caplog.text
+        caplog.clear()
+        cli(["--format", "json", "subsystem", "list"])
+        assert '"status": 0' in caplog.text
+        assert '"subsystems": []' in caplog.text
+        assert f'"nqn": "{subsystem16}",' not in caplog.text
+        assert f'"nqn": "{subsystem17}",' not in caplog.text
+        assert f'"nqn": "{subsystem18}",' not in caplog.text
+        caplog.clear()
+        cli(["get_subsystems"])
+        assert '"subsystems": []' in caplog.text
+        assert f'"nqn": "{subsystem16}",' not in caplog.text
+        assert f'"nqn": "{subsystem17}",' not in caplog.text
+        assert f'"nqn": "{subsystem18}",' not in caplog.text
+        assert f'"serial_number": "{serial2}",' not in caplog.text
+        assert host13 not in caplog.text
+        assert host14 not in caplog.text
+        assert host15 not in caplog.text
diff --git a/tests/test_cli_change_keys.py b/tests/test_cli_change_keys.py
index 9e93e7e13e9..756aff1a353 100644
--- a/tests/test_cli_change_keys.py
+++ b/tests/test_cli_change_keys.py
@@ -6,6 +6,7 @@
 from control.proto import gateway_pb2_grpc as pb2_grpc
 import copy
 import time
+import os

 image = "mytestdevimage"
 pool = "rbd"
@@ -38,7 +39,10 @@ def two_gateways(config):
     configA.config["gateway"]["name"] = nameA
     configA.config["gateway"]["override_hostname"] = nameA
     configA.config["spdk"]["rpc_socket_name"] = sockA
-    configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    if os.cpu_count() >= 4:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    else:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     portA = configA.getint("gateway", "port")
     configB.config["gateway"]["name"] = nameB
     configB.config["gateway"]["override_hostname"] = nameB
@@ -47,7 +51,10 @@ def two_gateways(config):
     discPortB = configB.getint("discovery", "port") + 1
     configB.config["gateway"]["port"] = str(portB)
     configB.config["discovery"]["port"] = str(discPortB)
-    configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    if os.cpu_count() >= 4:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"

     ceph_utils = CephUtils(config)
     with (GatewayServer(configA) as gatewayA, GatewayServer(configB) as gatewayB):
@@ -85,6 +92,11 @@ def test_change_host_key(caplog, two_gateways):
     assert f"Adding host {hostnqn2} to {subsystem}: Successful" in caplog.text
     assert f"Host {hostnqn2} has a DH-HMAC-CHAP key but subsystem {subsystem} has none, " \
         f"a unidirectional authentication will be used" in caplog.text
+    time.sleep(15)
+    assert f"Received request to add host {hostnqn2} to " \
+        f"{subsystem}, context:" in caplog.text
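From here to the end of the patch, every two-gateway test fixture gains the same guard: an SPDK reactor core mask such as -m 0x03 pins the target to specific CPUs, which cannot be honored on CI runners with fewer than four cores, so those hosts fall back to --disable-cpumask-locks instead. The recurring hunk reduces to this helper (illustrative only; the tests inline the if/else):

    import os

    def spdk_extra_args(core_mask):
        # Pin SPDK reactors on machines with enough CPUs; on small runners,
        # skip pinning and relax SPDK's cpumask lock checking instead.
        if os.cpu_count() >= 4:
            return f"-m {core_mask}"
        return "--disable-cpumask-locks"

    # e.g. configA.config["spdk"]["tgt_cmd_extra_args"] = spdk_extra_args("0x03")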
diff --git a/tests/test_cli_change_keys.py b/tests/test_cli_change_keys.py
index 9e93e7e13e9..756aff1a353 100644
--- a/tests/test_cli_change_keys.py
+++ b/tests/test_cli_change_keys.py
@@ -6,6 +6,7 @@ from control.proto import gateway_pb2_grpc as pb2_grpc
 import copy
 import time
+import os

 image = "mytestdevimage"
 pool = "rbd"
@@ -38,7 +39,10 @@ def two_gateways(config):
     configA.config["gateway"]["name"] = nameA
     configA.config["gateway"]["override_hostname"] = nameA
     configA.config["spdk"]["rpc_socket_name"] = sockA
-    configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    if os.cpu_count() >= 4:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    else:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     portA = configA.getint("gateway", "port")
     configB.config["gateway"]["name"] = nameB
     configB.config["gateway"]["override_hostname"] = nameB
@@ -47,7 +51,10 @@ def two_gateways(config):
     discPortB = configB.getint("discovery", "port") + 1
     configB.config["gateway"]["port"] = str(portB)
     configB.config["discovery"]["port"] = str(discPortB)
-    configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    if os.cpu_count() >= 4:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     ceph_utils = CephUtils(config)

     with (GatewayServer(configA) as gatewayA, GatewayServer(configB) as gatewayB):
@@ -85,6 +92,11 @@ def test_change_host_key(caplog, two_gateways):
     assert f"Adding host {hostnqn2} to {subsystem}: Successful" in caplog.text
     assert f"Host {hostnqn2} has a DH-HMAC-CHAP key but subsystem {subsystem} has none, " \
         f"a unidirectional authentication will be used" in caplog.text
+    time.sleep(15)
+    assert f"Received request to add host {hostnqn2} to " \
+        f"{subsystem}, context:
diff --git a/tests/test_cli_change_lb.py b/tests/test_cli_change_lb.py
@@ ... @@ def two_gateways(config):
+    if os.cpu_count() >= 4:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    else:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     portA = configA.getint("gateway", "port")
     configB.config["gateway"]["name"] = nameB
     configB.config["gateway"]["override_hostname"] = nameB
@@ -45,7 +49,10 @@ def two_gateways(config):
     discPortB = configB.getint("discovery", "port") + 1
     configB.config["gateway"]["port"] = str(portB)
     configB.config["discovery"]["port"] = str(discPortB)
-    configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    if os.cpu_count() >= 4:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     ceph_utils = CephUtils(config)

     with (GatewayServer(configA) as gatewayA, GatewayServer(configB) as gatewayB):
@@ -84,6 +91,10 @@ def verify_namespaces(caplog, gw_port, subsys, first_nsid, last_nsid, grp):


 def verify_namespaces_using_get_subsystems(caplog, gw_port, subsys, first_nsid, last_nsid, grp):
+    caplog.clear()
+    # call subsystem list first, this will refresh the cache
+    subsys_info = cli_test(["--server-port", gw_port, "subsystem", "list"])
+    assert len(subsys_info.subsystems) == 1
     caplog.clear()
     subsys_info = cli_test(["--server-port", gw_port, "get_subsystems"])
     assert len(subsys_info.subsystems) == 1
@@ -124,20 +135,26 @@ def try_change_one_namespace_lb_group_no_listeners(caplog, subsys, nsid_to_chang
     caplog.clear()
     cli(["--server-port", "5502", "namespace", "change_load_balancing_group", "--subsystem",
         subsys, "--nsid", nsid_to_change, "--load-balancing-group", new_group])
-    time.sleep(8)
-    assert "is owned by gateway None so try this command from it" in caplog.text
+    time.sleep(15)
+    assert f"Changing load balancing group of namespace {nsid_to_change} in {subsys} " \
+        f"to {new_group}: Successful" in caplog.text
+    assert "try running the command from there" not in caplog.text
+    assert f"Received manual request to change load balancing group for namespace with ID " \
+        f"{nsid_to_change} in {subsys} to {new_group}, context:
diff --git a/tests/test_cli_change_ns_visibility.py b/tests/test_cli_change_ns_visibility.py
@@ ... @@ def two_gateways(config):
+    if os.cpu_count() >= 4:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    else:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     portA = configA.getint("gateway", "port")
     configB.config["gateway"]["name"] = nameB
     configB.config["gateway"]["override_hostname"] = nameB
@@ -37,7 +41,10 @@ def two_gateways(config):
     discPortB = configB.getint("discovery", "port") + 1
     configB.config["gateway"]["port"] = str(portB)
     configB.config["discovery"]["port"] = str(discPortB)
-    configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    if os.cpu_count() >= 4:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     ceph_utils = CephUtils(config)

     with (GatewayServer(configA) as gatewayA, GatewayServer(configB) as gatewayB):
@@ -75,6 +82,12 @@ def test_change_namespace_visibility(caplog, two_gateways):
     cli(["--format", "json", "namespace", "list", "--subsystem", subsystem, "--nsid", "1"])
     assert '"nsid": 1' in caplog.text
     assert '"auto_visible": true' in caplog.text
+    time.sleep(15)
+    caplog.clear()
+    cli(["--server-port", "5502", "--format", "json", "namespace", "list",
+        "--subsystem", subsystem, "--nsid", "1"])
+    assert '"nsid": 1' in caplog.text
+    assert '"auto_visible": true' in caplog.text
     caplog.clear()
     cli(["namespace", "change_visibility", "--subsystem", subsystem,
         "--nsid", "1", "--auto-visible", "no"])
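Note: the if/else on os.cpu_count() above recurs, nearly verbatim, in almost every fixture this PR touches: with at least 4 CPUs each SPDK target gets a dedicated core mask, otherwise --disable-cpumask-locks lets two targets coexist on a small CI runner. A sketch of the pattern as a helper (the helper itself is not part of the PR):

    import os

    def spdk_extra_args(core_mask, min_cpus=4):
        """Return SPDK tgt extra args: a fixed core mask when enough CPUs
        exist, otherwise skip CPU-mask locking entirely."""
        if (os.cpu_count() or 1) >= min_cpus:
            return f"-m {core_mask}"
        return "--disable-cpumask-locks"

    # configA.config["spdk"]["tgt_cmd_extra_args"] = spdk_extra_args("0x03")
    # configB.config["spdk"]["tgt_cmd_extra_args"] = spdk_extra_args("0x0C")

One hardening note: os.cpu_count() may return None, in which case the bare `os.cpu_count() >= 4` comparison used in the diff raises a TypeError; the `or 1` fallback above avoids that.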
"--nsid", "1", "--auto-visible", "no"]) diff --git a/tests/test_dhchap.py b/tests/test_dhchap.py index 280b056bf78..1c130462e48 100644 --- a/tests/test_dhchap.py +++ b/tests/test_dhchap.py @@ -5,6 +5,7 @@ from control.cephutils import CephUtils import grpc import time +import os image = "mytestdevimage" pool = "rbd" @@ -79,7 +80,10 @@ def gateway(config): config.config["gateway"]["override_hostname"] = "GW1" config.config["gateway-logs"]["log_level"] = "debug" config.config["gateway"]["group"] = "" - config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x01" + if os.cpu_count() >= 4: + config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x01" + else: + config.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks" ceph_utils = CephUtils(config) with GatewayServer(config) as gateway: @@ -116,7 +120,10 @@ def gateway_encryption_disabled(config): config.config["gateway-logs"]["log_level"] = "debug" config.config["gateway"]["group"] = "" config.config["gateway"]["enable_key_encryption"] = "False" - config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x02" + if os.cpu_count() >= 4: + config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x02" + else: + config.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks" ceph_utils = CephUtils(config) with GatewayServer(config) as gateway_encryption_disabled: @@ -153,7 +160,10 @@ def gateway_no_encryption_key(config): config.config["gateway"]["group"] = "" config.config["gateway"]["enable_key_encryption"] = "True" config.config["gateway"]["encryption_key"] = "/etc/ceph/NOencryption.key" - config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x04" + if os.cpu_count() >= 4: + config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x04" + else: + config.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks" ceph_utils = CephUtils(config) with GatewayServer(config) as gateway_no_encryption_key: @@ -190,7 +200,10 @@ def gateway_no_key_encryption_disabled(config): config.config["gateway"]["group"] = "" config.config["gateway"]["enable_key_encryption"] = "False" config.config["gateway"]["encryption_key"] = "/etc/ceph/NOencryption.key" - config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x08" + if os.cpu_count() >= 4: + config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x08" + else: + config.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks" ceph_utils = CephUtils(config) with GatewayServer(config) as gateway_no_key_encryption_disabled: diff --git a/tests/test_grpc.py b/tests/test_grpc.py index 882b4ab61dc..55bb65bf13b 100644 --- a/tests/test_grpc.py +++ b/tests/test_grpc.py @@ -86,7 +86,7 @@ def test_create_get_subsys(caplog, config): # add host to the first namespace caplog.clear() cli(["namespace", "add_host", "--subsystem", f"{subsystem_prefix}0", - "--nsid", "1", "--host-nqn", f"{host_prefix}0"]) + "--nsid", "1", "--host-nqn", f"{host_prefix}0", "--force"]) assert "Failure adding host" not in caplog.text caplog.clear() @@ -117,7 +117,7 @@ def test_create_get_subsys(caplog, config): f"{subsystem_prefix}0, R/W IOs per second: 2000 " \ f"Read megabytes per second: 5" in caplog.text assert f"Received request to add host {host_prefix}0 to namespace 1 on " \ - f"{subsystem_prefix}0, context: None" in caplog.text + f"{subsystem_prefix}0, force: True, context: None" in caplog.text caplog.clear() cli(["--format", "plain", "subsystem", "list"]) assert "Exception" not in caplog.text diff --git a/tests/test_multi_gateway.py b/tests/test_multi_gateway.py index 9f530d02328..ff39326c742 100644 --- a/tests/test_multi_gateway.py +++ 
diff --git a/tests/test_multi_gateway.py b/tests/test_multi_gateway.py
index 9f530d02328..ff39326c742 100644
--- a/tests/test_multi_gateway.py
+++ b/tests/test_multi_gateway.py
@@ -105,7 +105,7 @@ def test_multi_gateway_coordination(config, image, conn):

     # Watch/Notify
     if update_notify:
-        time.sleep(1)
+        time.sleep(15)
         listB = json.loads(json_format.MessageToJson(
             stubB.list_subsystems(list_subsystems_req),
             preserving_proto_field_name=True, including_default_value_fields=True))['subsystems']
@@ -124,7 +124,7 @@ def test_multi_gateway_coordination(config, image, conn):
     assert nsListB[0]["rbd_pool_name"] == pool

     # Periodic update
-    time.sleep(update_interval_sec + 1)
+    time.sleep(update_interval_sec + 15)
     listB = json.loads(json_format.MessageToJson(
         stubB.list_subsystems(list_subsystems_req),
         preserving_proto_field_name=True, including_default_value_fields=True))['subsystems']
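Note: the multi-gateway test now sleeps 15 s (up from 1 s) before checking that gateway B observed A's changes. A deadline-based poll keeps such tests fast when propagation is quick and tolerant when it is slow; a generic sketch (the helper name and the list_subsystems_on_b callable are illustrative, not from the PR):

    import time

    def wait_for(predicate, timeout=15.0, interval=0.5):
        """Poll until predicate() is truthy or the deadline passes."""
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if predicate():
                return True
            time.sleep(interval)
        return False

    # assert wait_for(lambda: len(list_subsystems_on_b()) == expected_count)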
diff --git a/tests/test_ns_limit.py b/tests/test_ns_limit.py
new file mode 100644
index 00000000000..7750de3e6e7
--- /dev/null
+++ b/tests/test_ns_limit.py
@@ -0,0 +1,195 @@
+import pytest
+from control.server import GatewayServer
+from control.cli import main as cli
+from control.cli import main_test as cli_test
+from control.cephutils import CephUtils
+import grpc
+import copy
+import random
+import os
+import time
+from control.proto import gateway_pb2_grpc as pb2_grpc
+
+image_prefix = "testimage"
+image_count = 0
+pool = "rbd"
+subsystem_prefix = "nqn.2016-06.io.spdk:cnode"
+subsystem_count = 8
+namespace_count = 2048
+namespace_delete_percentage = 25
+group_name = "group1"
+update_interval = 300
+
+
+@pytest.fixture(scope="module")
+def two_gateways(config):
+    """Sets up and tears down two Gateways"""
+    nameA = "GatewayAA"
+    nameB = "GatewayBB"
+    sockA = f"spdk_{nameA}.sock"
+    sockB = f"spdk_{nameB}.sock"
+    addr = config.get("gateway", "addr")
+    config.config["gateway-logs"]["log_level"] = "debug"
+    config.config["gateway"]["group"] = group_name
+    config.config["gateway"]["state_update_interval_sec"] = str(update_interval)
+    configA = copy.deepcopy(config)
+    configB = copy.deepcopy(config)
+    configA.config["gateway"]["name"] = nameA
+    configA.config["gateway"]["override_hostname"] = nameA
+    configA.config["spdk"]["rpc_socket_name"] = sockA
+    if os.cpu_count() >= 4:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    else:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
+    portA = configA.getint("gateway", "port")
+
+    configB.config["gateway"]["name"] = nameB
+    configB.config["gateway"]["override_hostname"] = nameB
+    configB.config["spdk"]["rpc_socket_name"] = sockB
+    portB = portA + 2
+    discPortB = configB.getint("discovery", "port") + 1
+    configB.config["gateway"]["port"] = str(portB)
+    configB.config["discovery"]["port"] = str(discPortB)
+    if os.cpu_count() >= 4:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
+
+    ceph_utils = CephUtils(config)
+    with (GatewayServer(configA) as gatewayA, GatewayServer(configB) as gatewayB):
+        ceph_utils.execute_ceph_monitor_command(
+            "{" + f'"prefix":"nvme-gw create", "id": "{nameA}", "pool": "{pool}", '
+            f'"group": "{group_name}"' + "}"
+        )
+        ceph_utils.execute_ceph_monitor_command(
+            "{" + f'"prefix":"nvme-gw create", "id": "{nameB}", "pool": "{pool}", '
+            f'"group": "{group_name}"' + "}"
+        )
+        gatewayA.serve()
+        gatewayB.serve()
+
+        # Bind the client and Gateway
+        channelA = grpc.insecure_channel(f"{addr}:{portA}")
+        pb2_grpc.GatewayStub(channelA)
+        channelB = grpc.insecure_channel(f"{addr}:{portB}")
+        pb2_grpc.GatewayStub(channelB)
+
+        yield gatewayA, gatewayB
+
+        # Stop gateway
+        gatewayA.gateway_rpc.gateway_state.delete_state()
+        gatewayB.gateway_rpc.gateway_state.delete_state()
+        gatewayA.server.stop(grace=1)
+        gatewayB.server.stop(grace=1)
+
+
+def get_image_name():
+    global image_count
+    image = f"{image_prefix}_{image_count}"
+    image_count += 1
+    return image
+
+
+def create_namespaces_for_subsystem(caplog, subsys, ns_cnt):
+    for i in range(1, ns_cnt + 1):
+        caplog.clear()
+        image = get_image_name()
+        cli(["namespace", "add", "--subsystem", subsys, "--rbd-pool", pool,
+            "--rbd-image", image, "--size", "10MB", "--rbd-create-image"])
+        assert "Adding namespace " in caplog.text
+        assert f" to {subsys}: Successful" in caplog.text
+        assert "Failure adding namespace" not in caplog.text
+
+
+def create_namespaces_for_all_subsystems(caplog, subsys_cnt, ns_per_subsys):
+    for subsys_id in range(1, subsys_cnt + 1):
+        subsys = f"{subsystem_prefix}{subsys_id}"
+        caplog.clear()
+        cli(["subsystem", "add", "--subsystem", subsys, "--no-group-append"])
+        assert f"Adding subsystem {subsys}: Successful" in caplog.text
+        create_namespaces_for_subsystem(caplog, subsys, ns_per_subsys)
+
+
+def delete_one_namespace(caplog, subsys, nsid):
+    caplog.clear()
+    cli(["namespace", "del", "--subsystem", subsys, "--nsid", str(nsid)])
+    assert f"Deleting namespace {nsid} from {subsys}: Successful" in caplog.text
+
+
+def delete_namespaces_from_subsystem(caplog, subsys, subsys_ns_cnt, ns_count_to_delete):
+    assert ns_count_to_delete <= subsys_ns_cnt
+    nsids_to_delete = []
+    random.seed()
+    for i in range(1, ns_count_to_delete + 1):
+        while True:
+            ns_to_delete = random.randint(1, subsys_ns_cnt)
+            if ns_to_delete not in nsids_to_delete:
+                nsids_to_delete.insert(0, ns_to_delete)
+                break
+    assert len(nsids_to_delete) == ns_count_to_delete
+    for nsid in nsids_to_delete:
+        delete_one_namespace(caplog, subsys, nsid)
+
+
+def set_qos_for_subsystem_namespaces(caplog, subsys, subsys_ns_cnt):
+    for ns in range(1, subsys_ns_cnt + 1):
+        caplog.clear()
+        cli(["--format", "json", "namespace", "list", "--subsystem", subsys, "--nsid", str(ns)])
+        assert f'"nsid": {ns},' in caplog.text
+        assert '"rw_ios_per_second": "0"' in caplog.text
+        assert '"rw_mbytes_per_second": "0"' in caplog.text
+        assert '"r_mbytes_per_second": "0"' in caplog.text
+        assert '"w_mbytes_per_second": "0"' in caplog.text
+        caplog.clear()
+        cli(["namespace", "set_qos", "--subsystem", subsys, "--nsid", str(ns),
+            "--rw-ios-per-second", "2000"])
+        assert f"Setting QOS limits of namespace {ns} in {subsys}: Successful" in caplog.text
+        caplog.clear()
+        cli(["--format", "json", "namespace", "list", "--subsystem", subsys, "--nsid", str(ns)])
+        assert f'"nsid": {ns},' in caplog.text
+        assert '"rw_ios_per_second": "2000"' in caplog.text
+        assert '"rw_mbytes_per_second": "0"' in caplog.text
+        assert '"r_mbytes_per_second": "0"' in caplog.text
+        assert '"w_mbytes_per_second": "0"' in caplog.text
+
+
+def verify_namespace_count(caplog, port, desired_ns_count):
+    caplog.clear()
+    ns_list = cli_test(["--server-port", port, "--format", "json", "namespace", "list"])
+    assert len(ns_list.namespaces) == desired_ns_count
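Note: delete_namespaces_from_subsystem above draws distinct random NSIDs by rejection sampling in a while-loop. random.sample gives the same result (distinct values, no retry loop) in one call; ordering does not matter here since NSIDs stay stable across deletions:

    import random

    def pick_nsids_to_delete(subsys_ns_cnt, count):
        """Choose `count` distinct NSIDs out of 1..subsys_ns_cnt."""
        return random.sample(range(1, subsys_ns_cnt + 1), count)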
+
+
+def test_ns_limit(caplog, two_gateways):
+    gwA, gwB = two_gateways
+    portA = gwA.config.config["gateway"]["port"]
+    portB = gwB.config.config["gateway"]["port"]
+    waitForUpdate = max(int(gwA.config.config["gateway"]["state_update_interval_sec"]),
+                        int(gwB.config.config["gateway"]["state_update_interval_sec"]))
+    waitForUpdate += 10
+    ns_per_subsys = namespace_count // subsystem_count
+    assert ns_per_subsys > 0
+    assert ns_per_subsys * subsystem_count == namespace_count
+    create_namespaces_for_all_subsystems(caplog, subsystem_count, ns_per_subsys)
+    time.sleep(waitForUpdate)
+    verify_namespace_count(caplog, portA, namespace_count)
+    verify_namespace_count(caplog, portB, namespace_count)
+    namespace_count_to_delete = (ns_per_subsys * namespace_delete_percentage) // 100
+    assert namespace_count_to_delete > 0
+    assert namespace_count_to_delete < ns_per_subsys
+    for subsys_id in range(1, subsystem_count + 1):
+        subsys = f"{subsystem_prefix}{subsys_id}"
+        delete_namespaces_from_subsystem(caplog, subsys, ns_per_subsys, namespace_count_to_delete)
+    time.sleep(waitForUpdate)
+    verify_namespace_count(caplog, portA,
+                           namespace_count - (namespace_count_to_delete * subsystem_count))
+    verify_namespace_count(caplog, portB,
+                           namespace_count - (namespace_count_to_delete * subsystem_count))
+    for subsys_id in range(1, subsystem_count + 1):
+        subsys = f"{subsystem_prefix}{subsys_id}"
+        create_namespaces_for_subsystem(caplog, subsys, namespace_count_to_delete)
+    time.sleep(waitForUpdate)
+    verify_namespace_count(caplog, portA, namespace_count)
+    verify_namespace_count(caplog, portB, namespace_count)
+    for subsys_id in range(1, subsystem_count + 1):
+        subsys = f"{subsystem_prefix}{subsys_id}"
+        set_qos_for_subsystem_namespaces(caplog, subsys, ns_per_subsys)
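Note: with the module constants above, the scale of test_ns_limit works out as follows (pure arithmetic, shown for readability):

    subsystem_count = 8
    namespace_count = 2048
    namespace_delete_percentage = 25

    ns_per_subsys = namespace_count // subsystem_count                        # 256
    deleted_per_subsys = ns_per_subsys * namespace_delete_percentage // 100   # 64
    total_after_delete = namespace_count - deleted_per_subsys * subsystem_count
    assert (ns_per_subsys, deleted_per_subsys, total_after_delete) == (256, 64, 1536)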
diff --git a/tests/test_omap_lock.py b/tests/test_omap_lock.py
index 020e0be6bdd..a4f1e0095ac 100644
--- a/tests/test_omap_lock.py
+++ b/tests/test_omap_lock.py
@@ -3,6 +3,7 @@
 import grpc
 import json
 import time
+import os
 from google.protobuf import json_format
 from control.server import GatewayServer
 from control.cephutils import CephUtils
@@ -31,7 +32,10 @@ def setup_config(config, gw1_name, gw2_name, gw_group, update_notify, update_int
     configA.config["gateway"]["omap_file_lock_duration"] = str(lock_duration)
     configA.config["gateway"]["enable_spdk_discovery_controller"] = "True"
     configA.config["spdk"]["rpc_socket_name"] = sock1_name
-    configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    if os.cpu_count() >= 4:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    else:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     configB = copy.deepcopy(configA)
     portA = configA.getint("gateway", "port") + port_inc
     configA.config["gateway"]["port"] = str(portA)
@@ -40,7 +44,10 @@ def setup_config(config, gw1_name, gw2_name, gw_group, update_notify, update_int
     configB.config["gateway"]["override_hostname"] = gw2_name
     configB.config["gateway"]["port"] = str(portB)
     configB.config["spdk"]["rpc_socket_name"] = sock2_name
-    configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    if os.cpu_count() >= 4:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     return configA, configB
@@ -155,23 +162,32 @@ def create_resource_by_index(stub, i, caplog):
     if caplog is not None:
         assert f"create_subsystem {subsystem}: True" in caplog.text
         assert f"Failure creating subsystem {subsystem}" not in caplog.text
+        caplog.clear()
     namespace_req = pb2.namespace_add_req(subsystem_nqn=subsystem,
                                           rbd_pool_name=pool, rbd_image_name=image,
                                           block_size=4096, create_image=True,
                                           size=16 * 1024 * 1024, force=True)
     ret_namespace = stub.namespace_add(namespace_req)
     assert ret_namespace.status == 0
+    assert ret_namespace.nsid > 0
+    if caplog is not None:
+        assert f"subsystem_add_ns: {ret_namespace.nsid}" in caplog.text
+        assert "Failure adding namespace " not in caplog.text
+        caplog.clear()
     hostnqn = build_host_nqn(i)
     host_req = pb2.add_host_req(subsystem_nqn=subsystem, host_nqn=hostnqn)
     ret_host = stub.add_host(host_req)
     assert ret_host.status == 0
+    if caplog is not None:
+        assert f"add_host {hostnqn}: True" in caplog.text
+        assert f"Failure adding host {hostnqn} to {subsystem}" not in caplog.text
+        caplog.clear()
     host_req = pb2.add_host_req(subsystem_nqn=subsystem, host_nqn="*")
     ret_host = stub.add_host(host_req)
     assert ret_host.status == 0
     if caplog is not None:
-        assert f"add_host {hostnqn}: True" in caplog.text
         assert "add_host *: True" in caplog.text
         assert f"Failure allowing open host access to {subsystem}" not in caplog.text
-        assert f"Failure adding host {hostnqn} to {subsystem}" not in caplog.text
+        caplog.clear()


 def check_resource_by_index(i, subsys_list, hosts_info):
@@ -280,9 +296,12 @@ def test_trying_to_lock_twice(config, image, conn_lock_twice, caplog):
     caplog.clear()
     stubA, stubB = conn_lock_twice
-    with pytest.raises(Exception):
-        create_resource_by_index(stubA, 100000, None)
-        create_resource_by_index(stubB, 100001, None)
+    try:
+        with pytest.raises(Exception):
+            create_resource_by_index(stubA, 100000, None)
+            create_resource_by_index(stubB, 100001, None)
+    except SystemExit:
+        pass
     assert "OMAP file unlock was disabled, will not unlock file" in caplog.text
     assert "The OMAP file is locked, will try again in" in caplog.text
     assert "Unable to lock OMAP file" in caplog.text
@@ -299,7 +318,6 @@ def test_multi_gateway_concurrent_changes(config, image, conn_concurrent, caplog
             create_resource_by_index(stubA, i, caplog)
         else:
             create_resource_by_index(stubB, i, caplog)
-    assert "failed" not in caplog.text.lower()
     listener_req = pb2.create_listener_req(nqn=f"{subsystem_prefix}0",
                                            host_name=gwA.host_name, adrfam="ipv4",
@@ -311,7 +329,7 @@
            f"{subsystem_prefix}0 at 127.0.0.1:5001" in caplog.text
     assert "create_listener: True" in caplog.text

-    timeout = 15  # Maximum time to wait (in seconds)
+    timeout = 30  # Maximum time to wait (in seconds)
     start_time = time.time()
     expected_warning_other_gw = f"Listener not created as gateway's host name {gwB.host_name} " \
                                 f"differs from requested host {gwA.host_name}"
@@ -370,6 +388,7 @@ def test_multi_gateway_listener_update(config, image, conn_concurrent, caplog):
     assert f"Received request to create {gwA.host_name} TCP ipv4 listener for " \
            f"{subsystem} at 127.0.0.1:5101" in caplog.text
     assert "create_listener: True" in caplog.text
+    time.sleep(20)
     caplog.clear()
     listenerB_req = pb2.create_listener_req(nqn=subsystem, host_name=gwB.host_name,
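Note on the try/except wrapped around pytest.raises in test_trying_to_lock_twice above: SystemExit derives from BaseException, not Exception, so pytest.raises(Exception) will not swallow it when the gateway aborts the process instead of raising an ordinary error. The outer try treats either outcome as the expected failure; the pattern in isolation (a sketch, not a PR helper):

    import pytest

    def expect_failure_allowing_exit(fn, *args):
        """Either an ordinary exception or a SystemExit counts as the
        expected failure."""
        try:
            with pytest.raises(Exception):
                fn(*args)
        except SystemExit:
            pass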
diff --git a/tests/test_omap_no_read_lock.py b/tests/test_omap_no_read_lock.py
new file mode 100644
index 00000000000..3268d4ce845
--- /dev/null
+++ b/tests/test_omap_no_read_lock.py
@@ -0,0 +1,66 @@
+import pytest
+from control.server import GatewayServer
+from control.cephutils import CephUtils
+import grpc
+from control.proto import gateway_pb2_grpc as pb2_grpc
+import time
+
+pool = "rbd"
+group_name = "GROUPNAME"
+
+
+@pytest.fixture(scope="module")
+def gateway(config):
+    """Sets up and tears down Gateway"""
+
+    config.config["gateway"]["group"] = group_name
+    addr = config.get("gateway", "addr")
+    port = config.getint("gateway", "port")
+    config.config["gateway"]["omap_file_lock_on_read"] = "False"
+    config.config["gateway-logs"]["log_level"] = "debug"
+    ceph_utils = CephUtils(config)
+
+    with GatewayServer(config) as gateway:
+
+        # Start gateway
+        gateway.gw_logger_object.set_log_level("debug")
+        ceph_utils.execute_ceph_monitor_command(
+            "{" + f'"prefix":"nvme-gw create", "id": "{gateway.name}", "pool": "{pool}", '
+            f'"group": "{group_name}"' + "}"
+        )
+        gateway.serve()
+
+        # Bind the client and Gateway
+        channel = grpc.insecure_channel(f"{addr}:{port}")
+        stub = pb2_grpc.GatewayStub(channel)
+        yield gateway.gateway_rpc, stub
+
+        # Stop gateway
+        gateway.server.stop(grace=1)
+        gateway.gateway_rpc.gateway_state.delete_state()
+
+
+def test_no_read_lock(caplog, gateway):
+    gw, _ = gateway
+    lookfor = "Will not lock OMAP for read, this might cause using an inconsistent state when " \
+              "big OMAP file are used"
+    found = 0
+    time.sleep(10)
+    for oneline in caplog.get_records("setup"):
+        if oneline.message == lookfor:
+            found += 1
+    assert found == 1
+    caplog.clear()
+    gw.gateway_state.omap.get_state()
+    assert "Locked OMAP file before reading its content" not in caplog.text
+    assert "Released OMAP file lock after reading content" not in caplog.text
+    gw.rpc_lock.acquire()
+    caplog.clear()
+    gw.omap_lock.lock_omap()
+    assert "Locked OMAP exclusive" in caplog.text
+    assert "Locked OMAP shared" not in caplog.text
+    caplog.clear()
+    gw.gateway_state.omap.get_state()
+    assert "The OMAP file is locked, will try again in" not in caplog.text
+    gw.omap_lock.unlock_omap()
+    gw.rpc_lock.release()
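Note: test_omap_no_read_lock.py flips omap_file_lock_on_read to False and verifies that get_state() neither takes nor releases the shared lock, accepting the inconsistency risk the warning log spells out. Conceptually the read path looks like this (a sketch only; lock_shared/read_all/unlock are illustrative names, not the OmapGatewayState API):

    def get_state_sketch(omap, lock_on_read):
        """Read the OMAP state, optionally under a shared lock. Without the
        lock, a concurrent writer may be observed mid-update."""
        if lock_on_read:
            omap.lock_shared()
        try:
            return omap.read_all()
        finally:
            if lock_on_read:
                omap.unlock()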
diff --git a/tests/test_omap_read_lock.py b/tests/test_omap_read_lock.py
new file mode 100644
index 00000000000..fe14c9524b5
--- /dev/null
+++ b/tests/test_omap_read_lock.py
@@ -0,0 +1,158 @@
+import pytest
+from control.server import GatewayServer
+from control.cephutils import CephUtils
+from control.state import OmapLock
+import grpc
+from control.proto import gateway_pb2_grpc as pb2_grpc
+import copy
+import time
+import os
+import rados
+
+pool = "rbd"
+host_prefix = "nqn.2014-08.org.nvmexpress:uuid:893a6752-fe9b-ca48-aa93-e4565f3288"
+
+
+@pytest.fixture(scope="module")
+def two_gateways(config):
+    """Sets up and tears down two Gateways"""
+    grp = "group2"
+    nameA = "GatewayAA"
+    nameB = "GatewayBB"
+    sockA = f"spdk_{nameA}.sock"
+    sockB = f"spdk_{nameB}.sock"
+    config.config["gateway-logs"]["log_level"] = "debug"
+    config.config["gateway"]["group"] = grp
+    config.config["gateway"]["max_subsystems"] = "1024"
+    config.config["gateway"]["max_namespaces"] = "5120"
+    config.config["gateway"]["max_hosts"] = "5000"
+    config.config["gateway"]["rebalance_period_sec"] = "0"
+    config.config["gateway"]["state_update_notify"] = "False"
+    config.config["gateway"]["state_update_interval_sec"] = "300"
+    addr = config.get("gateway", "addr")
+    configA = copy.deepcopy(config)
+    configB = copy.deepcopy(config)
+    configA.config["gateway"]["name"] = nameA
+    configA.config["gateway"]["override_hostname"] = nameA
+    configA.config["spdk"]["rpc_socket_name"] = sockA
+    if os.cpu_count() >= 4:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x02"
+    else:
+        configA.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
+    portA = configA.getint("gateway", "port")
+    configB.config["gateway"]["name"] = nameB
+    configB.config["gateway"]["override_hostname"] = nameB
+    configB.config["spdk"]["rpc_socket_name"] = sockB
+    portB = portA + 2
+    discPortB = configB.getint("discovery", "port") + 1
+    configB.config["gateway"]["port"] = str(portB)
+    configB.config["discovery"]["port"] = str(discPortB)
+    if os.cpu_count() >= 4:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
+
+    ceph_utils = CephUtils(config)
+    with (GatewayServer(configA) as gatewayA, GatewayServer(configB) as gatewayB):
+        ceph_utils.execute_ceph_monitor_command(
+            "{" + f'"prefix":"nvme-gw create", "id": "{nameA}", "pool": "{pool}", '
+            f'"group": "{grp}"' + "}"
+        )
+        ceph_utils.execute_ceph_monitor_command(
+            "{" + f'"prefix":"nvme-gw create", "id": "{nameB}", "pool": "{pool}", '
+            f'"group": "{grp}"' + "}"
+        )
+        gatewayA.serve()
+        gatewayB.serve()
+
+        channelA = grpc.insecure_channel(f"{addr}:{portA}")
+        pb2_grpc.GatewayStub(channelA)
+        channelB = grpc.insecure_channel(f"{addr}:{portB}")
+        pb2_grpc.GatewayStub(channelB)
+
+        yield gatewayA.gateway_rpc, gatewayB.gateway_rpc
+        gatewayA.gateway_rpc.gateway_state.delete_state()
+        gatewayB.gateway_rpc.gateway_state.delete_state()
+        gatewayA.server.stop(grace=1)
+        gatewayB.server.stop(grace=1)
+
+
+def test_mixing_locks(caplog, two_gateways):
+    gwA, gwB = two_gateways
+
+    caplog.clear()
+    gwA.gateway_state.omap.get_state()
+    assert "Locked OMAP file before reading its content" in caplog.text
+    assert "Released OMAP file lock after reading content" in caplog.text
+    gwA.rpc_lock.acquire()
+    gwB.rpc_lock.acquire()
+    caplog.clear()
+    gwA.omap_lock.lock_omap()
+    assert "Locked OMAP exclusive" in caplog.text
+    assert "Locked OMAP shared" not in caplog.text
+    time.sleep(19)   # A little less than omap_file_lock_duration
+    caplog.clear()
+    gwB.gateway_state.omap.get_state()
+    assert "The OMAP file is locked, will try again in" in caplog.text
+    assert "Succeeded to lock OMAP file (shared) after" in caplog.text
+    caplog.clear()
+    with pytest.raises(rados.ObjectNotFound):
+        gwA.omap_lock.unlock_omap()
+    assert "OMAP was unlocked" not in caplog.text
+    assert "No such lock, the exclusive lock might have expired" in caplog.text
+    OmapLock.reset_lock_markers()
+    time.sleep(25)
+    caplog.clear()
+    gwA.omap_lock.lock_omap(False, False, 1)
+    assert "Locked OMAP shared" in caplog.text
+    caplog.clear()
+    gwB.omap_lock.lock_omap(False, False, 2)
+    assert "Locked OMAP shared" in caplog.text
+    assert "We already locked the OMAP file" not in caplog.text
+    caplog.clear()
+    gwA.omap_lock.unlock_omap(False, 1)
+    assert "OMAP was unlocked" in caplog.text
+    caplog.clear()
+    gwB.omap_lock.unlock_omap(False, 2)
+    assert "OMAP was unlocked" in caplog.text
+    caplog.clear()
+    with pytest.raises(rados.ObjectNotFound):
+        gwA.omap_lock.unlock_omap(False, 1)
+    assert "OMAP was unlocked" not in caplog.text
+    assert "No such lock, the shared lock might have expired" in caplog.text
+    OmapLock.reset_lock_markers()
+    time.sleep(25)
+    caplog.clear()
+    gwA.omap_lock.lock_omap()
+    assert "Locked OMAP exclusive" in caplog.text
+    caplog.clear()
+    try:
+        gwA.omap_lock.lock_omap()
+    except RuntimeError as ex:
+        assert str(ex) == "An attempt to lock OMAP exclusively twice from the same thread"
+        pass
+    caplog.clear()
+    gwA.omap_lock.unlock_omap()
+    assert "OMAP was unlocked" in caplog.text
+    caplog.clear()
+    with pytest.raises(rados.ObjectNotFound):
+        gwA.omap_lock.unlock_omap()
+    assert "OMAP was unlocked" not in caplog.text
+    assert "No such lock, the exclusive lock might have expired" in caplog.text
+    OmapLock.reset_lock_markers()
+    time.sleep(25)
+    caplog.clear()
+    gwA.omap_lock.lock_omap()
+    assert "Locked OMAP exclusive" in caplog.text
+    gotFileExists = False
+    caplog.clear()
+    try:
+        gwA.omap_lock.lock_omap(False, False, 3)
+    except FileExistsError:
+        gotFileExists = True
+    assert "No need to lock OMAP for read as we already have it locked for write" in caplog.text
+    assert "Locked OMAP shared" not in caplog.text
+    assert gotFileExists
+    gwA.omap_lock.unlock_omap()
+    gwA.rpc_lock.release()
+    gwB.rpc_lock.release()
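Note: several branches above sleep past omap_file_lock_duration and then expect rados.ObjectNotFound from unlock_omap(). RADOS advisory locks expire server-side, so releasing an already-expired lock reports the lock as gone; with python3-rados the raw behavior looks like this (object, lock name, and cookie values are illustrative):

    import rados

    def unlock_ignoring_expiry(ioctx, obj, lock_name, cookie):
        try:
            ioctx.unlock(obj, lock_name, cookie)
        except rados.ObjectNotFound:
            pass  # the lock timed out and was reaped; nothing left to release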
diff --git a/tests/test_omap_read_lock_ignore_errors.py b/tests/test_omap_read_lock_ignore_errors.py
new file mode 100644
index 00000000000..aa7f1ac4ddb
--- /dev/null
+++ b/tests/test_omap_read_lock_ignore_errors.py
@@ -0,0 +1,60 @@
+import pytest
+from control.server import GatewayServer
+from control.cephutils import CephUtils
+import grpc
+from control.proto import gateway_pb2_grpc as pb2_grpc
+import time
+
+pool = "rbd"
+group_name = "GROUPNAME"
+
+
+@pytest.fixture(scope="module")
+def gateway(config):
+    """Sets up and tears down Gateway"""
+
+    config.config["gateway"]["group"] = group_name
+    addr = config.get("gateway", "addr")
+    port = config.getint("gateway", "port")
+    config.config["gateway"]["omap_file_ignore_unlock_errors"] = "True"
+    config.config["gateway-logs"]["log_level"] = "debug"
+    ceph_utils = CephUtils(config)
+
+    with GatewayServer(config) as gateway:
+
+        # Start gateway
+        gateway.gw_logger_object.set_log_level("debug")
+        ceph_utils.execute_ceph_monitor_command(
+            "{" + f'"prefix":"nvme-gw create", "id": "{gateway.name}", "pool": "{pool}", '
+            f'"group": "{group_name}"' + "}"
+        )
+        gateway.serve()
+
+        # Bind the client and Gateway
+        channel = grpc.insecure_channel(f"{addr}:{port}")
+        pb2_grpc.GatewayStub(channel)
+        yield gateway.gateway_rpc
+
+        # Stop gateway
+        gateway.server.stop(grace=1)
+        gateway.gateway_rpc.gateway_state.delete_state()
+
+
+def test_ignore_unlock_errors(caplog, gateway):
+    gw = gateway
+    lookfor = "OMAP unlock errors will be ignored, the gateway will continue"
+    found = 0
+    time.sleep(10)
+    for oneline in caplog.get_records("setup"):
+        if oneline.message == lookfor:
+            found += 1
+    assert found == 1
+    caplog.clear()
+    gw.rpc_lock.acquire()
+    gw.omap_lock.lock_omap()
+    assert "Locked OMAP exclusive" in caplog.text
+    assert "Locked OMAP shared" not in caplog.text
+    time.sleep(25)   # A little more than omap_file_lock_duration
+    caplog.clear()
+    gw.omap_lock.unlock_omap()
+    assert "No such lock, the exclusive lock might have expired" in caplog.text
diff --git a/tests/test_psk.py b/tests/test_psk.py
index 2cf320b2a29..ea57a85da07 100644
--- a/tests/test_psk.py
+++ b/tests/test_psk.py
@@ -5,6 +5,7 @@ from control.cephutils import CephUtils
 import grpc
 import time
+import os

 image = "mytestdevimage"
 pool = "rbd"
@@ -63,7 +64,10 @@ def gateway(config):
     config.config["gateway"]["override_hostname"] = "GW1"
     config.config["gateway-logs"]["log_level"] = "debug"
     config.config["gateway"]["group"] = ""
-    config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    if os.cpu_count() >= 4:
+        config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x03"
+    else:
+        config.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     ceph_utils = CephUtils(config)

     with GatewayServer(config) as gateway:
@@ -100,7 +104,10 @@ def gateway_no_encryption_key(config):
     config.config["gateway-logs"]["log_level"] = "debug"
     config.config["gateway"]["group"] = ""
     config.config["gateway"]["encryption_key"] = "/etc/ceph/NOencryption.key"
-    config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    if os.cpu_count() >= 4:
+        config.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C"
+    else:
+        config.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks"
     ceph_utils = CephUtils(config)

     with GatewayServer(config) as gateway_no_encryption_key:
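Note: each fixture that starts its own SPDK target above uses a different core mask (0x01, 0x02, 0x03, 0x04, 0x08, 0x0C across these files), keeping concurrently running targets on disjoint CPUs. Decoding the masks makes the disjointness easy to check:

    def cores_of(mask):
        """Decode an SPDK -m core mask into CPU indices, e.g. 0x0C -> {2, 3}."""
        return {i for i in range(64) if (int(mask, 16) >> i) & 1}

    assert cores_of("0x03") == {0, 1}
    assert cores_of("0x0C") == {2, 3}
    assert not cores_of("0x03") & cores_of("0x0C")   # disjoint targets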
diff --git a/tests/test_server.py b/tests/test_server.py
index d904c7d9567..10069d473d1 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -23,7 +23,7 @@ def validate_exception(self, e):
         pid = int(m.group(1))
         code = int(m.group(2))
         assert pid > 0
-        assert code
+        assert code == 0 or code

     def remove_core_files(self, directory_path):
         # List all files starting with "core." in the core directory
@@ -69,6 +69,33 @@ def test_no_coredumps_on_gracefull_shutdown(self):
         time.sleep(10)     # let it dump
         self.assert_no_core_files(self.core_dir)

+    def test_discovery_exit(self):
+        """Tests discovery service sub process exiting."""
+        test_config = copy.deepcopy(self.config)
+        signals = [signal.SIGABRT, signal.SIGTERM, signal.SIGKILL, signal.SIGINT]
+
+        for sig in signals:
+            with self.assertRaises(SystemExit) as cm:
+                with GatewayServer(test_config) as gateway:
+                    gateway.set_group_id(0)
+                    gateway.serve()
+
+                    # Give the gateway some time to start
+                    time.sleep(17)
+
+                    # Send signal to the discovery service process
+                    assert gateway.discovery_pid
+                    os.kill(gateway.discovery_pid, sig)
+
+                    # Block on running keep alive ping
+                    gateway.keep_alive()
+
+            # Assert error exit code
+            self.validate_exception(cm.exception)
+
+            # Clean up cores
+            self.remove_core_files(self.core_dir)
+
     def test_monc_exit(self):
         """Tests monitor client sub process abort."""
         config_monc_abort = copy.deepcopy(self.config)
diff --git a/tests/test_state.py b/tests/test_state.py
index f46eae2c5bc..97284b7ed96 100644
--- a/tests/test_state.py
+++ b/tests/test_state.py
@@ -22,7 +22,7 @@ def local_state():
 @pytest.fixture
 def omap_state(config):
     """Sets up and tears down OMAP state object."""
-    omap = OmapGatewayState(config, "test")
+    omap = OmapGatewayState(config, None, "test")
     omap.delete_state()
     yield omap
     omap.delete_state()
@@ -49,7 +49,7 @@ def test_state_polling_update(config, ioctx, local_state, omap_state):

     update_counter = 0

-    def _state_polling_update(update, is_add_req):
+    def _state_polling_update(update, is_add_req, break_interval):
         nonlocal update_counter
         update_counter += 1
         for k, v in update.items():
@@ -109,9 +109,8 @@ def test_state_notify_update(config, ioctx, local_state, omap_state):
     update_counter = 0
     notify_event = threading.Event()   # Event to signal when notify is called

-    def _state_notify_update(update, is_add_req):
+    def _state_notify_update(update, is_add_req, break_interval):
         nonlocal update_counter
-        nonlocal notify_event
         update_counter += 1
         elapsed = time.time() - start
         assert elapsed < update_interval_sec
@@ -140,7 +139,7 @@ def _state_notify_update(update, is_add_req):
     version = 1
     update_interval_sec = 10
     state = GatewayStateHandler(config, local_state, omap_state,
-                                _state_notify_update, "test")
+                                _state_notify_update, None, "test")
     key = "namespace_test"
     state.update_interval = update_interval_sec
     state.use_notify = True
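Note: the test_state.py changes track two signature changes: OmapGatewayState and GatewayStateHandler now take an extra (here None) argument, and state-update callbacks receive a third break_interval parameter. Callbacks written for the old two-argument form must grow the parameter even if they ignore it; a minimal conforming stub (mirrors the tests above, the counter logic is this sketch's):

    def make_counting_callback():
        """Build a callback matching the new
        (update, is_add_req, break_interval) signature."""
        calls = {"n": 0}

        def _cb(update, is_add_req, break_interval):
            calls["n"] += 1

        return _cb, calls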
configB.config["gateway"]["name"] = nameB configB.config["gateway"]["override_hostname"] = nameB @@ -40,7 +44,10 @@ def two_gateways(config): discPortB = configB.getint("discovery", "port") + 1 configB.config["gateway"]["port"] = str(portB) configB.config["discovery"]["port"] = str(discPortB) - configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C" + if os.cpu_count() >= 4: + configB.config["spdk"]["tgt_cmd_extra_args"] = "-m 0x0C" + else: + configB.config["spdk"]["tgt_cmd_extra_args"] = "--disable-cpumask-locks" ceph_utils = CephUtils(config) with (GatewayServer(configA) as gatewayA, GatewayServer(configB) as gatewayB):