diff --git a/.github/actions/make_release/entrypoint.sh b/.github/actions/make_release/entrypoint.sh
index 957d8b367c0..09a02b6c738 100755
--- a/.github/actions/make_release/entrypoint.sh
+++ b/.github/actions/make_release/entrypoint.sh
@@ -1,5 +1,7 @@
 #!/bin/bash -l
+set -e
+
 # Only need to do any of this if the version has been updated
 # NOTE: The diff-index with HEAD^ implies that the TAG
 # must be updated in the last commit. But version update
diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
index 6e174f0b74a..058686eb21a 100644
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@@ -15,6 +15,8 @@ jobs:
     name: Create Release
     if: github.repository == 'daos-stack/daos'
     runs-on: ubuntu-latest
+    permissions:
+      contents: write
     steps:
       - uses: actions/checkout@v4
         with:
diff --git a/TAG b/TAG
index c884d2e32af..ca1fdbfd4c1 100644
--- a/TAG
+++ b/TAG
@@ -1 +1 @@
-2.6.4-rc1
+2.6.4-rc2
diff --git a/ci/provisioning/post_provision_config.sh b/ci/provisioning/post_provision_config.sh
index 14980c86a03..c6d2e26ffdc 100755
--- a/ci/provisioning/post_provision_config.sh
+++ b/ci/provisioning/post_provision_config.sh
@@ -25,10 +25,54 @@ source ci/provisioning/post_provision_config_common_functions.sh
 # shellcheck disable=SC1091
 source ci/junit.sh
 
+# This script needs to be able to run outside of CI for testing.
+# Before running the script, environment variables may need to be set for
+# the specific site.
 : "${MLNX_VER_NUM:=24.04-0.6.6.0}"
-: "${DISTRO:=EL_7}"
+# This is tangled and needs a better fix: DISTRO is passed as EL_8 for
+# EL_9, yet other places expect DISTRO to really be EL_8 and not EL_9.
+
+# Since the caller has to be fixed later, first set defaults for use
+# outside of CI.
+: "${DISTRO:=unknown}"
+
+# When running outside of CI, we can assume that this is run on the target
+# system, and if DISTRO is unknown, we can look it up.
+if [[ "$DISTRO" == unknown ]]; then
+    # shellcheck disable=SC1091
+    source /etc/os-release
+    : "${ID_LIKE:=rhel}"
+    : "${ID:=unknown}"
+    : "${VERSION_ID:=8}"
+    prefix="EL"
+    version="${VERSION_ID%%.*}"
+    if [[ "$ID_LIKE" == *suse* ]]; then
+        prefix="LEAP"
+    elif [[ "$ID" == *ubuntu* ]]; then
+        prefix="UBUNTU"
+        version="$VERSION_ID"
+    fi
+    DISTRO="${prefix}_${version}"
+fi
+
+# Helper scripts should be distro family specific, not distro version
+# specific (e.g. EL_8 and EL_9 both map to the EL family).
+FAMILY="${DISTRO%%_*}"
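+# Illustration with assumed values (not part of this change): on a Rocky
+# Linux 9 node, /etc/os-release has ID_LIKE="rhel centos fedora" and
+# VERSION_ID="9.2", so the lookup above yields DISTRO="EL_9" and
+# FAMILY="EL"; on openSUSE Leap 15.5 it yields DISTRO="LEAP_15" and
+# FAMILY="LEAP".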
+: "${CI_USER:=jenkins}" retry_cmd 300 clush -B -S -l root -w "$NODESTRING" -c ci_key* --dest=/tmp/ @@ -54,10 +98,13 @@ function create_host_file() { return 0 } -if create_host_file "$NODESTRING" "./hosts" "/etc/hosts"; then - retry_cmd 300 clush -B -S -l root -w "$NODESTRING" -c ./hosts --dest=/etc/hosts -else - echo "ERROR: Failed to create host file" +if [ "$NODESTRING" != "localhost" ]; then + if create_host_file "$NODESTRING" "./hosts" "/etc/hosts"; then + retry_cmd 300 clush -B -S -l root -w "$NODESTRING" \ + -c ./hosts --dest=/etc/hosts + else + echo "ERROR: Failed to create host file" + fi fi @@ -67,6 +114,7 @@ sanitized_commit_message="$(echo "$COMMIT_MESSAGE" | sed -e 's/\(["\$]\)/\\\1/g' if ! retry_cmd 2400 clush -B -S -l root -w "$NODESTRING" \ "export PS4='$PS4' MY_UID=$(id -u) + CI_USER=\"${CI_USER}\" CONFIG_POWER_ONLY=${CONFIG_POWER_ONLY:-} INST_REPOS=\"${INST_REPOS:-}\" INST_RPMS=\"${INST_RPMS:-}\" @@ -74,15 +122,15 @@ if ! retry_cmd 2400 clush -B -S -l root -w "$NODESTRING" \ REPOSITORY_URL=\"${REPOSITORY_URL:-}\" JENKINS_URL=\"${JENKINS_URL:-}\" DISTRO=\"$DISTRO\" - DAOS_STACK_RETRY_DELAY_SECONDS=\"$DAOS_STACK_RETRY_DELAY_SECONDS\" - DAOS_STACK_RETRY_COUNT=\"$DAOS_STACK_RETRY_COUNT\" + DAOS_STACK_RETRY_DELAY_SECONDS=\"${DAOS_STACK_RETRY_DELAY_SECONDS:-}\" + DAOS_STACK_RETRY_COUNT=\"${DAOS_STACK_RETRY_COUNT:-}\" MLNX_VER_NUM=\"$MLNX_VER_NUM\" - BUILD_URL=\"$BUILD_URL\" - STAGE_NAME=\"$STAGE_NAME\" - OPERATIONS_EMAIL=\"$OPERATIONS_EMAIL\" + BUILD_URL=\"${BUILD_URL:-}\" + STAGE_NAME=\"${STAGE_NAME:-}\" + OPERATIONS_EMAIL=\"${OPERATIONS_EMAIL:-}\" COMMIT_MESSAGE=\"$sanitized_commit_message\" REPO_FILE_URL=\"$REPO_FILE_URL\" - ARTIFACTORY_URL=\"${ARTIFACTORY_URL:-}\" + ARTIFACTORY_URL=\"${ARTIFACTORY_URL}\" BRANCH_NAME=\"${BRANCH_NAME:-}\" CHANGE_TARGET=\"${CHANGE_TARGET:-}\" CI_RPM_TEST_VERSION=\"${CI_RPM_TEST_VERSION:-}\" @@ -91,12 +139,13 @@ if ! 
retry_cmd 2400 clush -B -S -l root -w "$NODESTRING" \ REPO_PATH=\"${REPO_PATH:-}\" ARTIFACTS_URL=\"${ARTIFACTS_URL:-}\" COVFN_DISABLED=\"${COVFN_DISABLED:-true}\" - DAOS_CI_INFO_DIR=\"${DAOS_CI_INFO_DIR:-wolf-2:/export/scratch}\" + DAOS_CI_INFO_DIR=\"${DAOS_CI_INFO_DIR:-}\" + CI_SCONS_ARGS=\"${CI_SCONS_ARGS:-}\" $(cat ci/stacktrace.sh) $(cat ci/junit.sh) $(cat ci/provisioning/post_provision_config_common_functions.sh) $(cat ci/provisioning/post_provision_config_common.sh) - $(cat ci/provisioning/post_provision_config_nodes_"$DISTRO".sh) + $(cat ci/provisioning/post_provision_config_nodes_"$FAMILY".sh) $(cat ci/provisioning/post_provision_config_nodes.sh)"; then report_junit post_provision_config.sh results.xml "$NODESTRING" exit 1 @@ -105,7 +154,7 @@ fi git log --format=%B -n 1 HEAD | sed -ne '1s/^\([A-Z][A-Z]*-[0-9][0-9]*\) .*/\1/p' \ -e '/^Fixes:/{s/^Fixes: *//;s/ /\ /g;p}' | \ - retry_cmd 60 ssh -i ci_key -l jenkins "${NODELIST%%,*}" \ + retry_cmd 60 ssh -i ci_key -l "$CI_USER" "${NODESTRING%%,*}" \ "cat >/tmp/commit_fixes" git log --pretty=format:%h --abbrev-commit --abbrev=7 | - retry_cmd 60 ssh -i ci_key -l jenkins "${NODELIST%%,*}" "cat >/tmp/commit_list" + retry_cmd 60 ssh -i ci_key -l "$CI_USER" "${NODESTRING%%,*}" "cat >/tmp/commit_list" diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index a4e847ce17a..5f17916e934 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -94,6 +94,13 @@ retry_dnf() { send_mail "Command retry successful in $STAGE_NAME after $((attempt + 1)) attempts using ${repo_servers[0]} as initial repo server " \ "Command: ${args[*]}\nAttempts: $attempt\nStatus: $rc" fi + if [ -n "$ARTIFACTORY_URL" ]; then + dnfx="dnf" + if command -v dnf4; then + dnfx="dnf4" + fi + "$dnfx" config-manager --disable 'epel*' || true + fi return 0 fi # Command failed, retry @@ -229,21 +236,6 @@ timeout_cmd() { return "$rc" } -fetch_repo_config() { - local repo_server="$1" - - # shellcheck disable=SC1091 - . /etc/os-release - local repo_file="daos_ci-${ID}${VERSION_ID%%.*}-$repo_server" - local repopath="${REPOS_DIR}/$repo_file" - if ! curl -f -o "$repopath" "$REPO_FILE_URL$repo_file.repo"; then - echo "Failed to fetch repo file $REPO_FILE_URL$repo_file.repo" - return 1 - fi - - return 0 -} - pr_repos() { if [ -n "$CI_PR_REPOS" ]; then echo "$CI_PR_REPOS" @@ -274,9 +266,6 @@ set_local_repo() { # shellcheck disable=SC1091 . /etc/os-release - rm -f "$REPOS_DIR/daos_ci-${ID}${VERSION_ID%%.*}".repo - ln "$REPOS_DIR/daos_ci-${ID}${VERSION_ID%%.*}"{-"$repo_server",.repo} - if [ "$repo_server" = "artifactory" ]; then if { [[ \ $(pr_repos) = *\ daos@PR-* ]] || [ -z "$(rpm_test_version)" ]; } && [[ ! ${CHANGE_TARGET:-$BRANCH_NAME} =~ ^[-.0-9A-Za-z]+-testing ]]; then @@ -298,30 +287,6 @@ set_local_repo() { update_repos() { local DISTRO_NAME="$1" - # Update the repo files - local repo_server - for repo_server in "${repo_servers[@]}"; do - if ! fetch_repo_config "$repo_server"; then - # leave the existing on-image repo config alone if the repo fetch fails - send_mail "Fetch repo file for repo server \"$repo_server\" failed. Continuing on with in-image repos." - echo "Fetch repo file for repo server \"$repo_server\" failed. Continuing on with in-image repos." - return 1 - fi - done - - # we're not actually using the set_local_repos.sh script - # setting a repo server is as easy as renaming a file - #if ! 
-    #if ! curl -o /usr/local/sbin/set_local_repos.sh-tmp "${REPO_FILE_URL}set_local_repos.sh"; then
-    #    send_mail "Fetch set_local_repos.sh failed. Continuing on with in-image copy."
-    #else
-    #    cat /usr/local/sbin/set_local_repos.sh-tmp > /usr/local/sbin/set_local_repos.sh
-    #    chmod +x /usr/local/sbin/set_local_repos.sh
-    #    rm -f /usr/local/sbin/set_local_repos.sh-tmp
-    #fi
-
-    # successfully grabbed them all, so replace the entire $REPOS_DIR
-    # content with them
-
     # This is not working right on a second run.
     # using a quick hack to stop deleting a critical repo local file
@@ -353,11 +318,11 @@ post_provision_config_nodes() {
         dnf -y erase fuse3\*
     fi
 
-    if $CONFIG_POWER_ONLY; then
+    if [ -n "$CONFIG_POWER_ONLY" ]; then
         rm -f "$REPOS_DIR"/*_job_daos-stack_job_*_job_*.repo
         time dnf -y erase fio fuse ior-hpc mpich-autoload \
-                     ompi argobots cart daos daos-client dpdk \
-                     fuse-libs libisa-l libpmemobj mercury mpich \
+                     argobots cart daos daos-client dpdk \
+                     libisa-l libpmemobj mercury mpich \
                      pmix protobuf-c spdk libfabric libpmem \
                      munge-libs munge slurm \
                      slurm-example-configs slurmctld slurm-slurmmd
@@ -365,7 +330,27 @@
 
     cat /etc/os-release
 
-    if lspci | grep "ConnectX-6" && ! grep MOFED_VERSION /etc/do-release; then
+    # ConnectX must be 5 or later to support MOFED/DOCA drivers.
+    # RoCE tests with Mellanox adapters may use MOFED/DOCA drivers.
+    last_pci_bus=''
+    mellanox_drivers=false
+    while IFS= read -r line; do
+        pci_bus="${line%.*}"
+        if [ "$pci_bus" == "$last_pci_bus" ]; then
+            # We only use one interface on a dual interface HBA.
+            # Fortunately lspci appears to group them together.
+            continue
+        fi
+        last_pci_bus="$pci_bus"
+        mlnx_type="${line##*ConnectX-}"
+        mlnx_type="${mlnx_type%]*}"
+        if [ "$mlnx_type" -ge 5 ]; then
+            mellanox_drivers=true
+            break
+        fi
+    done < <(lspci -mm | grep "ConnectX")
+
+    if "$mellanox_drivers"; then
         # Remove OPA and install MOFED
         install_mofed
     fi
diff --git a/ci/provisioning/post_provision_config_nodes.sh b/ci/provisioning/post_provision_config_nodes.sh
index 14ac540d3a4..47bacfe4ec3 100644
--- a/ci/provisioning/post_provision_config_nodes.sh
+++ b/ci/provisioning/post_provision_config_nodes.sh
@@ -16,14 +16,29 @@ if command -v dnf; then
     bootstrap_dnf
 fi
 
-if ! grep ":$MY_UID:" /etc/group; then
-    groupadd -g "$MY_UID" jenkins
-fi
+# If in CI, use the made-up user "jenkins" with the UID that the build agent
+# is currently using. Not sure that the UID is actually important any more,
+# and that parameter can probably be removed in the future.
+# Nothing actually cares what the account name is as long as it does not
+# conflict with an existing name and we are consistent in its use.
+CI_USER="jenkins"
+
 mkdir -p /localhome
-if ! grep ":$MY_UID:$MY_UID:" /etc/passwd; then
-    useradd -b /localhome -g "$MY_UID" -u "$MY_UID" -s /bin/bash jenkins
+if ! getent passwd "$CI_USER"; then
+    # If that UID already exists, then this is not being run in CI.
+    if ! getent passwd "$MY_UID"; then
+        if ! getent group "$MY_UID"; then
+            groupadd -g "$MY_UID" "$CI_USER"
+        fi
+        useradd -b /localhome -g "$MY_UID" -u "$MY_UID" -s /bin/bash "$CI_USER"
+    else
+        # Still need a "$CI_USER" account, so just make one up.
+        useradd -b /localhome -s /bin/bash "$CI_USER"
+    fi
 fi
-jenkins_ssh=/localhome/jenkins/.ssh
+ci_uid="$(id -u "$CI_USER")"
+ci_gid="$(id -g "$CI_USER")"
+jenkins_ssh=/localhome/"$CI_USER"/.ssh
 mkdir -p "${jenkins_ssh}"
 if ! grep -q -s -f /tmp/ci_key.pub "${jenkins_ssh}/authorized_keys"; then
     cat /tmp/ci_key.pub >> "${jenkins_ssh}/authorized_keys"
@@ -37,12 +52,18 @@ cp /tmp/ci_key "${jenkins_ssh}/id_rsa"
 cp /tmp/ci_key_ssh_config "${jenkins_ssh}/config"
 chmod 700 "${jenkins_ssh}"
 chmod 600 "${jenkins_ssh}"/{authorized_keys,id_rsa*,config}
-chown -R jenkins.jenkins /localhome/jenkins/
-echo "jenkins ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/jenkins
+chown -R "${ci_uid}:${ci_gid}" "/localhome/${CI_USER}/"
+echo "$CI_USER ALL=(ALL) NOPASSWD: ALL" > "/etc/sudoers.d/$CI_USER"
 
-# /scratch is needed on test nodes
-mkdir -p /scratch
-retry_cmd 2400 mount "${DAOS_CI_INFO_DIR}" /scratch
+# /scratch is needed on test nodes to hold CI info for now.
+# DAOS tests need to be changed to use /CIShare instead.
+if [ -n "$DAOS_CI_INFO_DIR" ]; then
+    mkdir -p /CIShare
+    retry_cmd 2400 mount "${DAOS_CI_INFO_DIR}" /CIShare
+    # This part only until DAOS is migrated to use /CIShare
+    rm -f /scratch
+    ln -sfn /CIShare /scratch
+fi
 
 # defined in ci/functional/post_provision_config_nodes_<distro>.sh
 # and catted to the remote node along with this script
diff --git a/ci/provisioning/post_provision_config_nodes_EL_8.sh b/ci/provisioning/post_provision_config_nodes_EL.sh
similarity index 71%
rename from ci/provisioning/post_provision_config_nodes_EL_8.sh
rename to ci/provisioning/post_provision_config_nodes_EL.sh
index 27b35fb4b15..75e1d7934e3 100644
--- a/ci/provisioning/post_provision_config_nodes_EL_8.sh
+++ b/ci/provisioning/post_provision_config_nodes_EL.sh
@@ -66,27 +66,26 @@ install_mofed() {
         gversion="${gversion%.*}"
     fi
 
-    # Add a repo to install Mellanox_OFED RPMS
-    : "${ARTIFACTORY_URL:=https://artifactory.dc.hpdd.intel.com/artifactory/}"
-    # Temporary fix
-    if [[ ${ARTIFACTORY_URL} != *"/artifactory" ]]; then
-        ARTIFACTORY_URL="${ARTIFACTORY_URL}artifactory"
+    : "${ARTIFACTORY_URL:=}"
+    if [ -z "$ARTIFACTORY_URL" ]; then
+        return
     fi
-    mellanox_proxy="${ARTIFACTORY_URL}/mellanox-proxy/mlnx_ofed/"
-    mellanox_key_url="${ARTIFACTORY_URL}/mlnx_ofed/RPM-GPG-KEY-Mellanox"
-    rpm --import "$mellanox_key_url"
-    repo_url="$mellanox_proxy$MLNX_VER_NUM/rhel$gversion/x86_64/"
-    dnf -y config-manager --add-repo="$repo_url"
-    dnf -y config-manager --save --setopt="$(url_to_repo "$repo_url")".gpgcheck=1
-    dnf repolist || true
 
-    time dnf -y install mlnx-ofed-basic ucx-cma ucx-ib ucx-knem ucx-rdmacm ucx-xpmem
+    # Install Mellanox OFED or DOCA RPMS
+    install_mellanox="install_mellanox.sh"
+    script_url="${ARTIFACTORY_URL}/raw-internal/sre_tools/$install_mellanox"
+    install_target="/usr/local/sbin/$install_mellanox"
 
-    # now, upgrade firmware
-    time dnf -y install mlnx-fw-updater
+    if [ ! -e "$install_target" ]; then
+        if ! curl --silent --show-error --fail \
+             -o "/usr/local/sbin/$install_mellanox" "$script_url"; then
+            echo "Failed to fetch $script_url"
+            return 1
+        fi
+        chmod 0755 "$install_target"
+    fi
 
-    # Make sure that tools are present.
-    #ls /usr/bin/ib_* /usr/bin/ibv_*
+    MELLANOX_VERSION="$MLNX_VER_NUM" "$install_mellanox"
 
     dnf list --showduplicates perftest
     if [ "$gversion" == "8.5" ]; then
@@ -96,5 +95,4 @@ install_mofed() {
         dnf list --showduplicates ucx-knem
         dnf remove -y ucx-knem || true
     fi
-
 }
diff --git a/ci/provisioning/post_provision_config_nodes_LEAP_15.sh b/ci/provisioning/post_provision_config_nodes_LEAP.sh
old mode 100755
new mode 100644
similarity index 95%
rename from ci/provisioning/post_provision_config_nodes_LEAP_15.sh
rename to ci/provisioning/post_provision_config_nodes_LEAP.sh
index 5a266860945..4e6dc3e4e41
--- a/ci/provisioning/post_provision_config_nodes_LEAP_15.sh
+++ b/ci/provisioning/post_provision_config_nodes_LEAP.sh
@@ -8,7 +8,7 @@ bootstrap_dnf() {
     rm -rf "$REPOS_DIR"
     ln -s ../zypp/repos.d "$REPOS_DIR"
 
-    dnf -y remove lua-lmod
+    dnf -y remove lua54 lua-lmod
     dnf -y --nogpgcheck install lua-lmod '--repo=*lua*' --repo '*network-cluster*'
 }
diff --git a/ci/provisioning/post_provision_config_nodes_UBUNTU_20_04.sh b/ci/provisioning/post_provision_config_nodes_UBUNTU.sh
old mode 100755
new mode 100644
similarity index 100%
rename from ci/provisioning/post_provision_config_nodes_UBUNTU_20_04.sh
rename to ci/provisioning/post_provision_config_nodes_UBUNTU.sh
diff --git a/debian/changelog b/debian/changelog
index fa102c27f4c..219e1430fc7 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+daos (2.6.4-2) unstable; urgency=medium
+  [ Dalton Bohning ]
+  * Second release candidate for 2.6.4
+
+ -- Dalton Bohning  Tue, 05 Aug 2025 08:30:00 -0800
+
 daos (2.6.4-1) unstable; urgency=medium
   [ Phillip Henderson ]
   * First release candidate for 2.6.4
diff --git a/docs/admin/administration.md b/docs/admin/administration.md
index e4b37646670..f26f4c4ac88 100644
--- a/docs/admin/administration.md
+++ b/docs/admin/administration.md
@@ -1101,6 +1101,28 @@ After extending the system, the cache of the `daos_agent` service of the client
 nodes needs to be refreshed. For detailed information, please refer to the
 [System Deployment documentation][1].
 
+### Adding or removing Management Service (MS) replicas
+
+The DAOS Management Service (MS) runs on a selected subset of `daos_server` nodes.
+An administrator may add or remove hosts from the MS replica list.
+
+1. Stop I/O and safely shut down all `daos_server` and `daos_agent` processes.
+2. Update the `mgmt_svc_replicas` list in the `daos_server` configuration file
+   (see the example below).
+3. Update the `access_points` list in the `daos_agent` configuration file.
+4. Update `hostlist` in the `dmg` configuration file, if applicable.
+5. Restart all `daos_server` and `daos_agent` processes.
+
+To verify that the updated MS replicas came up correctly:
+
+1. Use the `dmg system query` command to check that all expected ranks have
+   come up in the Joined state. The command should not time out.
+2. Use the `dmg system leader-query` command to ensure a leader election has
+   completed.
+
+!!! warning
+    When removing or replacing MS replicas, do *not* replace all old replicas
+    with new ones. At least one old replica must remain in the list to act as
+    a data source for the new replicas.
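+
+For illustration only (the hostnames and port below are placeholders, not
+values mandated by this change), after steps 2 and 3 the updated lists might
+look like:
+
+```yaml
+# daos_server.yml
+mgmt_svc_replicas: ["server-0:10001", "server-1:10001", "server-2:10001"]
+
+# daos_agent.yml
+access_points: ["server-0:10001", "server-1:10001", "server-2:10001"]
+```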
+
 ## Software Upgrade
diff --git a/src/bio/bio_config.c b/src/bio/bio_config.c
index 2be2854b841..27764a66ae3 100644
--- a/src/bio/bio_config.c
+++ b/src/bio/bio_config.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2021-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -19,6 +20,8 @@
 /* JSON tags should match encode/decode logic in src/control/server/storage/bdev/backend_json.go */
 
 #define JSON_MAX_CHARS 4096
+/* Positive return code: requested entry was not found (vs. negative decode errors) */
+#define JSON_NOT_FOUND 1
+#define BDEV_NAME_MAX_LEN 256
 
 struct json_config_ctx {
@@ -374,14 +377,6 @@ read_config(const char *config_file, struct json_config_ctx *ctx)
        return rc;
 }
 
-static void
-free_json_config_ctx(struct json_config_ctx *ctx)
-{
-       D_FREE(ctx->values);
-       D_FREE(ctx->json_data);
-       D_FREE(ctx);
-}
-
 static int
 load_vmd_subsystem_config(struct json_config_ctx *ctx, bool *vmd_enabled)
 {
@@ -489,8 +484,6 @@ add_traddrs_from_bdev_subsys(struct json_config_ctx *ctx, bool vmd_enabled,
        return rc;
 }
 
-#define BDEV_NAME_MAX_LEN 256
-
 static int
 check_name_from_bdev_subsys(struct json_config_ctx *ctx)
 {
@@ -732,19 +725,24 @@ bio_add_allowed_alloc(const char *nvme_conf, struct spdk_env_opts *opts, int *ro
        rc = add_bdevs_to_opts(ctx, bdev_ss, *vmd_enabled, opts);
 out:
-       free_json_config_ctx(ctx);
+       D_FREE(ctx->values);
+       D_FREE(ctx->json_data);
+       D_FREE(ctx);
        return rc;
 }
 
+/* Decode the parameters of the daos_data config entry named @method_name from
+ * @nvme_conf into @out.
+ */
 static int
-decode_daos_data(const char *nvme_conf, const char *method_name, struct config_entry *cfg)
+decode_daos_object(const char *nvme_conf, const char *method_name,
+                  const struct spdk_json_object_decoder *decoders, size_t num_decoders, void *out)
 {
-       struct json_config_ctx *ctx;
+       struct json_config_ctx *ctx;
        struct spdk_json_val   *daos_data;
+       struct config_entry     cfg = {};
        int                     rc = 0;
 
        D_ASSERT(nvme_conf != NULL);
-       D_ASSERT(cfg != NULL);
+       D_ASSERT(method_name != NULL);
 
        D_ALLOC_PTR(ctx);
        if (ctx == NULL)
@@ -752,7 +750,7 @@
        rc = read_config(nvme_conf, ctx);
        if (rc != 0)
-               D_GOTO(out, rc);
+               goto out;
 
        /* Capture daos_data JSON object */
        rc = spdk_json_find(ctx->values, "daos_data", NULL, &daos_data,
@@ -762,6 +760,8 @@
                D_GOTO(out, rc = -DER_INVAL);
        }
 
+       D_ASSERT(daos_data != NULL);
+
        /* Capture config array in ctx */
        rc = spdk_json_decode_object(daos_data, daos_data_decoders,
                                     SPDK_COUNTOF(daos_data_decoders), ctx);
@@ -773,19 +773,19 @@
        /* Get 'config' array first configuration entry */
        ctx->config_it = spdk_json_array_first(ctx->config);
        if (ctx->config_it == NULL) {
-               /* Entry not-found so return positive RC */
-               D_GOTO(out, rc = 1);
+               D_DEBUG(DB_MGMT, "%s entry not found in spdk bootstrap config\n", method_name);
+               D_GOTO(out, rc = JSON_NOT_FOUND);
        }
 
        while (ctx->config_it != NULL) {
                rc = spdk_json_decode_object(ctx->config_it, config_entry_decoders,
-                                            SPDK_COUNTOF(config_entry_decoders), cfg);
+                                            SPDK_COUNTOF(config_entry_decoders), &cfg);
                if (rc < 0) {
                        D_ERROR("Failed to decode 'config' entry: %s\n", spdk_strerror(-rc));
                        D_GOTO(out, rc = -DER_INVAL);
                }
 
-               if (strcmp(cfg->method, method_name) == 0)
+               if (strcmp(cfg.method, method_name) == 0)
                        break;
 
                /* Move on to next subsystem config */
@@ -793,46 +793,27 @@
        }
 
        if (ctx->config_it == NULL) {
-               /* Entry not-found so return positive RC */
-               rc = 1;
+               D_DEBUG(DB_MGMT, "%s entry not found in spdk bootstrap config\n", method_name);
+               D_GOTO(out, rc = JSON_NOT_FOUND);
        }
-
-struct busid_range_info hotplug_busid_range = {};
-
-static int
-get_hotplug_busid_range(const char *nvme_conf)
-{
-       struct config_entry cfg = {};
-       int                 rc;
-
-       rc = decode_daos_data(nvme_conf, NVME_CONF_SET_HOTPLUG_RANGE, &cfg);
-       if (rc != 0)
-               goto out;
 
-       rc = spdk_json_decode_object(cfg.params, busid_range_decoders,
-                                    SPDK_COUNTOF(busid_range_decoders),
-                                    &hotplug_busid_range);
+       rc = spdk_json_decode_object(cfg.params, decoders, num_decoders, out);
        if (rc < 0) {
-               D_ERROR("Failed to decode '%s' entry: %s)\n", NVME_CONF_SET_HOTPLUG_RANGE,
-                       spdk_strerror(-rc));
+               D_ERROR("Failed to decode '%s' entry: %s\n", method_name, spdk_strerror(-rc));
                D_GOTO(out, rc = -DER_INVAL);
        }
-       D_INFO("'%s' read from config: %X-%X\n", NVME_CONF_SET_HOTPLUG_RANGE,
-              hotplug_busid_range.begin, hotplug_busid_range.end);
 
 out:
-       if (cfg.method != NULL)
-               D_FREE(cfg.method);
-       /* Decode functions return positive RC for success or not-found */
-       if (rc > 0)
-               rc = 0;
-       return 0;
+       D_FREE(cfg.method);
+       D_FREE(ctx->values);
+       D_FREE(ctx->json_data);
+       D_FREE(ctx);
+       return rc;
 }
 
+struct busid_range_info hotplug_busid_range = {};
+
 static bool
 hotplug_filter_fn(const struct spdk_pci_addr *addr)
 {
@@ -864,15 +845,20 @@
 {
        int rc;
 
-       D_ASSERT(nvme_conf != NULL);
-
-       rc = get_hotplug_busid_range(nvme_conf);
-       if (rc != 0)
+       rc = decode_daos_object(nvme_conf, NVME_CONF_SET_HOTPLUG_RANGE, busid_range_decoders,
+                               SPDK_COUNTOF(busid_range_decoders), &hotplug_busid_range);
+       if (rc != 0) {
+               if (rc == JSON_NOT_FOUND)
+                       rc = 0;
                return rc;
+       }
+
+       D_INFO("'%s' read from config: %X-%X\n", NVME_CONF_SET_HOTPLUG_RANGE,
+              hotplug_busid_range.begin, hotplug_busid_range.end);
 
        spdk_nvme_pcie_set_hotplug_filter(hotplug_filter_fn);
 
-       return rc;
+       return 0;
 }
 
 /**
@@ -886,23 +872,15 @@ bio_set_hotplug_filter(const char *nvme_conf)
 int
 bio_read_accel_props(const char *nvme_conf)
 {
-       struct config_entry     cfg = {};
        struct accel_props_info accel_props = {};
        int                     rc;
 
-       D_ASSERT(nvme_conf != NULL);
-
-       rc = decode_daos_data(nvme_conf, NVME_CONF_SET_ACCEL_PROPS, &cfg);
-       if (rc != 0)
-               goto out;
-
-       rc = spdk_json_decode_object(cfg.params, accel_props_decoders,
-                                    SPDK_COUNTOF(accel_props_decoders),
-                                    &accel_props);
-       if (rc < 0) {
-               D_ERROR("Failed to decode '%s' entry (%s)\n", NVME_CONF_SET_ACCEL_PROPS,
-                       spdk_strerror(-rc));
-               D_GOTO(out, rc = -DER_INVAL);
+       rc = decode_daos_object(nvme_conf, NVME_CONF_SET_ACCEL_PROPS, accel_props_decoders,
+                               SPDK_COUNTOF(accel_props_decoders), &accel_props);
+       if (rc != 0) {
+               if (rc == JSON_NOT_FOUND)
+                       rc = 0;
+               return rc;
        }
 
        D_INFO("'%s' read from config, setting: %s, capabilities: move=%s,crc=%s\n",
@@ -911,13 +889,8 @@ bio_read_accel_props(const char *nvme_conf)
"true" : "false"); /* TODO: do something useful with acceleration engine properties */ -out: - if (cfg.method != NULL) - D_FREE(cfg.method); - /* Decode functions return positive RC for success or not-found */ - if (rc > 0) - rc = 0; - return rc; + + return 0; } /** @@ -934,25 +907,19 @@ bio_read_accel_props(const char *nvme_conf) int bio_read_rpc_srv_settings(const char *nvme_conf, bool *enable, const char **sock_addr) { - struct config_entry cfg = {}; struct rpc_srv_info rpc_srv_settings = {}; int rc; - D_ASSERT(nvme_conf != NULL); D_ASSERT(enable != NULL); D_ASSERT(sock_addr != NULL); D_ASSERT(*sock_addr == NULL); - rc = decode_daos_data(nvme_conf, NVME_CONF_SET_SPDK_RPC_SERVER, &cfg); - if (rc != 0) - goto out; - - rc = spdk_json_decode_object(cfg.params, rpc_srv_decoders, SPDK_COUNTOF(rpc_srv_decoders), - &rpc_srv_settings); - if (rc < 0) { - D_ERROR("Failed to decode '%s' entry: %s)\n", NVME_CONF_SET_SPDK_RPC_SERVER, - spdk_strerror(-rc)); - D_GOTO(out, rc = -DER_INVAL); + rc = decode_daos_object(nvme_conf, NVME_CONF_SET_SPDK_RPC_SERVER, rpc_srv_decoders, + SPDK_COUNTOF(rpc_srv_decoders), &rpc_srv_settings); + if (rc != 0) { + if (rc == JSON_NOT_FOUND) + rc = 0; + return rc; } *enable = rpc_srv_settings.enable; @@ -960,13 +927,8 @@ bio_read_rpc_srv_settings(const char *nvme_conf, bool *enable, const char **sock D_INFO("'%s' read from config: enabled=%d, addr %s\n", NVME_CONF_SET_SPDK_RPC_SERVER, *enable, (char *)*sock_addr); -out: - if (cfg.method != NULL) - D_FREE(cfg.method); - /* Decode functions return positive RC for success or not-found */ - if (rc > 0) - rc = 0; - return rc; + + return 0; } /** @@ -984,28 +946,34 @@ int bio_read_auto_faulty_criteria(const char *nvme_conf, bool *enable, uint32_t *max_io_errs, uint32_t *max_csum_errs) { - struct config_entry cfg = {}; struct auto_faulty_info auto_faulty_criteria = {}; int rc; - rc = decode_daos_data(nvme_conf, NVME_CONF_SET_AUTO_FAULTY, &cfg); - if (rc != 0) - goto out; - - rc = spdk_json_decode_object(cfg.params, auto_faulty_decoders, - SPDK_COUNTOF(auto_faulty_decoders), &auto_faulty_criteria); - if (rc < 0) { - D_ERROR("Failed to decode '%s' entry: %s)\n", NVME_CONF_SET_AUTO_FAULTY, - spdk_strerror(-rc)); - D_GOTO(out, rc = -DER_INVAL); + D_ASSERT(enable != NULL); + D_ASSERT(max_io_errs != NULL); + D_ASSERT(max_csum_errs != NULL); + + rc = decode_daos_object(nvme_conf, NVME_CONF_SET_AUTO_FAULTY, auto_faulty_decoders, + SPDK_COUNTOF(auto_faulty_decoders), &auto_faulty_criteria); + if (rc != 0) { + if (rc == JSON_NOT_FOUND) { + rc = 0; + *enable = false; + *max_io_errs = UINT32_MAX; + *max_csum_errs = UINT32_MAX; + D_DEBUG(DB_MGMT, "bdev auto-faulty criteria disabled as not configured\n"); + } + return rc; } *enable = auto_faulty_criteria.enable; if (*enable == false) { *max_io_errs = UINT32_MAX; *max_csum_errs = UINT32_MAX; - goto out; + D_DEBUG(DB_MGMT, "bdev auto-faulty criteria disabled\n"); + return 0; } + *max_io_errs = auto_faulty_criteria.max_io_errs; if (*max_io_errs == 0) *max_io_errs = UINT32_MAX; @@ -1013,16 +981,10 @@ bio_read_auto_faulty_criteria(const char *nvme_conf, bool *enable, uint32_t *max if (*max_csum_errs == 0) *max_csum_errs = UINT32_MAX; -out: D_INFO("NVMe auto faulty is %s. Criteria: max_io_errs:%u, max_csum_errs:%u\n", *enable ? 
"enabled" : "disabled", *max_io_errs, *max_csum_errs); - if (cfg.method != NULL) - D_FREE(cfg.method); - /* Decode functions return positive RC for success or not-found */ - if (rc > 0) - rc = 0; - return rc; + return 0; } struct json_bdev_nvme_ctx { diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index 3a3c9cb59dc..9ed8fb77314 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -623,15 +623,24 @@ fun:racecall } { - go 1.22.3 race + tsan::MemoryAccessRange Memcheck:Value8 fun:_ZN6__tsan18MemoryAccessRangeTILb0EEEvPNS_11ThreadStateEmmm + ... + fun:racecall +} +{ + tsan::MemoryAccessRange + Memcheck:Value8 + fun:_ZN6__tsan18MemoryAccessRangeTILb1EEEvPNS_11ThreadStateEmmm + ... fun:racecall } { - go 1.22.3 race + tsan::TraceRestartMemoryAccess Memcheck:Value8 fun:_ZN6__tsan24TraceRestartMemoryAccessEPNS_11ThreadStateEmmmm + ... fun:racecall } { diff --git a/src/client/api/event.c b/src/client/api/event.c index af3895ff32b..fc568e7bf8c 100644 --- a/src/client/api/event.c +++ b/src/client/api/event.c @@ -104,6 +104,9 @@ daos_eq_lib_init(crt_init_options_t *crt_info) unlock: D_MUTEX_UNLOCK(&daos_eq_lock); + if (rc == 0) + daos_dti_reset(); + return rc; crt: crt_finalize(); diff --git a/src/client/dfuse/il/int_posix.c b/src/client/dfuse/il/int_posix.c index 29a8620cd21..e068651a5d3 100644 --- a/src/client/dfuse/il/int_posix.c +++ b/src/client/dfuse/il/int_posix.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/common/misc.c b/src/common/misc.c index cda884c6126..e5dddf0a64e 100644 --- a/src/common/misc.c +++ b/src/common/misc.c @@ -1,6 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. * (C) Copyright 2025 Google LLC + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -668,7 +669,16 @@ daos_crt_init_opt_get(bool server, int ctx_nr) return &daos_crt_init_opt; } -static __thread uuid_t dti_uuid; +/* For new created thread (via pthread_create), the pre-thread variable \@dti_uuid will be + * automatically reset as zero, then subsequent daos_dti_gen() will generate new UUID for + * the transactions sponsored by current thread. + * + * For new created process (via fork), the pre-thread variable \@dti_uuid will be reset as + * zero via daos_dti_reset(). The process owner needs to explicitly call it or in-directly + * trigger it (such as via daos_eq_lib_init) to guarantee that new process will not reuse + * parent's UUID for the transactions sponsored by current process. 
diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index b983ed32f75..f55cdf36eeb 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -1472,27 +1472,20 @@ ds_cont_child_put(struct ds_cont_child *cont)
        cont_child_put(tls->dt_cont_cache, cont);
 }
 
-struct ds_dtx_resync_args {
-       struct ds_pool_child *pool;
-       uuid_t                co_uuid;
-};
-
 static void
 ds_dtx_resync(void *arg)
 {
-       struct ds_dtx_resync_args *ddra = arg;
-       int                        rc;
+       struct ds_cont_child *cont = arg;
+       int                   rc;
 
-       rc = dtx_resync(ddra->pool->spc_hdl, ddra->pool->spc_uuid,
-                       ddra->co_uuid, ddra->pool->spc_map_version, false);
+       rc = dtx_resync(cont->sc_pool->spc_hdl, cont, cont->sc_pool->spc_map_version, false);
        if (rc != 0)
                D_WARN("Fail to resync some DTX(s) for the pool/cont "
                       DF_UUID "/" DF_UUID " that may affect subsequent "
                       "operations: rc = " DF_RC "\n",
-                      DP_UUID(ddra->pool->spc_uuid), DP_UUID(ddra->co_uuid), DP_RC(rc));
+                      DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), DP_RC(rc));
 
-       ds_pool_child_put(ddra->pool);
-       D_FREE(ddra);
+       ds_cont_child_put(cont);
 }
 
 int
@@ -1602,8 +1595,6 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
         * Both cases are not expected.
         */
        if (cont_uuid != NULL && !uuid_is_null(cont_uuid)) {
-               struct ds_dtx_resync_args *ddra = NULL;
-
                /*
                 * NB: When cont_uuid == NULL, it's not a real container open
                 * but for creating rebuild global container handle.
@@ -1633,25 +1624,11 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
                        D_GOTO(err_cont, rc);
                }
 
-               D_ALLOC_PTR(ddra);
-               if (ddra == NULL) {
-                       ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
-                       D_GOTO(err_dtx, rc = -DER_NOMEM);
-               }
-
-               ddra->pool = ds_pool_child_lookup(hdl->sch_cont->sc_pool->spc_uuid);
-               if (ddra->pool == NULL) {
-                       ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
-                       D_FREE(ddra);
-                       D_GOTO(err_dtx, rc = -DER_NO_HDL);
-               }
-               uuid_copy(ddra->co_uuid, cont_uuid);
-               rc = dss_ult_create(ds_dtx_resync, ddra, DSS_XS_SELF,
-                                   0, 0, NULL);
+               ds_cont_child_get(hdl->sch_cont);
+               rc = dss_ult_create(ds_dtx_resync, hdl->sch_cont, DSS_XS_SELF, 0, 0, NULL);
                if (rc != 0) {
                        ABT_mutex_unlock(hdl->sch_cont->sc_mutex);
-                       ds_pool_child_put(hdl->sch_cont->sc_pool);
-                       D_FREE(ddra);
+                       /* drop the reference taken for the resync ULT */
+                       ds_cont_child_put(hdl->sch_cont);
                        D_GOTO(err_dtx, rc);
                }
diff --git a/src/control/server/faultdomain_test.go b/src/control/server/faultdomain_test.go
index 51e2fbe29a5..228be8486a8 100644
--- a/src/control/server/faultdomain_test.go
+++ b/src/control/server/faultdomain_test.go
@@ -1,5 +1,6 @@
 //
-// (C) Copyright 2020-2022 Intel Corporation.
+// (C) Copyright 2020-2024 Intel Corporation.
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -10,6 +11,7 @@ import (
        "fmt"
        "os"
        "path/filepath"
+       "strings"
        "testing"
 
        "github.com/pkg/errors"
@@ -142,7 +144,7 @@ func TestServer_getFaultDomain(t *testing.T) {
                },
                "default gets hostname": {
                        cfg:       &config.Server{},
-                       expResult: system.FaultDomainSeparator + realHostname,
+                       expResult: system.FaultDomainSeparator + strings.ToLower(realHostname),
                },
        } {
                t.Run(name, func(t *testing.T) {
diff --git a/src/control/system/membership.go b/src/control/system/membership.go
index 603948ff5f9..3ab5eb9b4ae 100644
--- a/src/control/system/membership.go
+++ b/src/control/system/membership.go
@@ -204,8 +204,8 @@ func (m *Membership) joinReplace(req *JoinRequest) (*JoinResponse, error) {
        memberToReplace := &Member{}
        *memberToReplace = *cm
 
-       m.log.Debugf("replace-rank: updating member with UUID %s->%s", memberToReplace.UUID,
-               req.UUID)
+       m.log.Debugf("replace-rank: updating member with UUID %s->%s, Addr %s",
+               memberToReplace.UUID, req.UUID, memberToReplace.Addr)
 
        if err := m.db.RemoveMember(cm); err != nil {
                return nil, errors.Wrap(err, "removing old member in replace-rank join request")
diff --git a/src/control/system/raft/database.go b/src/control/system/raft/database.go
index 9b4723a3826..5aedf7cd460 100644
--- a/src/control/system/raft/database.go
+++ b/src/control/system/raft/database.go
@@ -1,5 +1,6 @@
 //
 // (C) Copyright 2020-2024 Intel Corporation.
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -819,6 +820,10 @@ func (db *Database) UpdateMember(m *system.Member) error {
                return err
        }
 
+       if err := db.manageVoter(m, raftOpUpdateMember); err != nil {
+               return err
+       }
+
        return db.submitMemberUpdate(raftOpUpdateMember, &memberUpdate{Member: m})
 }
 
diff --git a/src/control/system/raft/database_members.go b/src/control/system/raft/database_members.go
index f9775e912e9..14d6cc70925 100644
--- a/src/control/system/raft/database_members.go
+++ b/src/control/system/raft/database_members.go
@@ -1,5 +1,6 @@
 //
 // (C) Copyright 2020-2022 Intel Corporation.
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -48,27 +49,31 @@ func (mrm MemberRankMap) MarshalJSON() ([]byte, error) {
 }
 
 func (mam MemberAddrMap) addMember(addr *net.TCPAddr, m *system.Member) {
-       if _, exists := mam[addr.String()]; !exists {
-               mam[addr.String()] = []*system.Member{}
-       }
-       mam[addr.String()] = append(mam[addr.String()], m)
+       as := addr.String()
+
+       mam[as] = append(mam[as], m)
 }
 
 func (mam MemberAddrMap) removeMember(m *system.Member) {
-       members, exists := mam[m.Addr.String()]
-       if !exists {
+       mas := m.Addr.String()
+
+       if _, exists := mam[mas]; !exists {
                return
        }
-       for i, cur := range members {
-               if m.UUID == cur.UUID {
-                       // remove from slice
-                       members = append(members[:i], members[i+1:]...)
-                       break
+
+       newMembers := []*system.Member{}
+       for _, cur := range mam[mas] {
+               if m.UUID != cur.UUID {
+                       nm := *cur
+                       newMembers = append(newMembers, &nm)
                }
        }
-       if len(members) == 0 {
-               delete(mam, m.Addr.String())
+       if len(newMembers) == 0 {
+               delete(mam, mas)
+               return
        }
+
+       mam[mas] = newMembers
 }
 
 // MarshalJSON creates a serialized representation of the MemberAddrMap.
@@ -170,7 +175,7 @@ func (mdb *MemberDatabase) updateMember(m *system.Member) {
        mdb.addToFaultDomainTree(cur)
 }
 
-// removeMember is responsible for removing new Member and updating all
+// removeMember is responsible for removing a Member and updating all
 // of the relevant maps.
 func (mdb *MemberDatabase) removeMember(m *system.Member) {
        delete(mdb.Ranks, m.Rank)
diff --git a/src/control/system/raft/database_members_test.go b/src/control/system/raft/database_members_test.go
new file mode 100644
index 00000000000..5c5984d71e9
--- /dev/null
+++ b/src/control/system/raft/database_members_test.go
@@ -0,0 +1,88 @@
+//
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
+//
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+package raft
+
+import (
+       "testing"
+
+       "github.com/google/go-cmp/cmp"
+       "github.com/google/go-cmp/cmp/cmpopts"
+       "github.com/google/uuid"
+
+       "github.com/daos-stack/daos/src/control/system"
+)
+
+func TestRaft_MemberAddrMap_removeMember(t *testing.T) {
+       makeMap := func(t *testing.T, members ...*system.Member) MemberAddrMap {
+               mam := make(MemberAddrMap)
+               for _, m := range members {
+                       mam[m.Addr.String()] = append(mam[m.Addr.String()], m)
+               }
+               return mam
+       }
+
+       testMember := func(t *testing.T, idx int) *system.Member {
+               return system.MockMember(t, uint32(idx), system.MemberStateJoined)
+       }
+
+       members := func(t *testing.T, n int) []*system.Member {
+               members := []*system.Member{}
+               for i := 0; i < n; i++ {
+                       members = append(members, testMember(t, i))
+               }
+               return members
+       }
+
+       for name, tc := range map[string]struct {
+               addrMap  MemberAddrMap
+               toRemove *system.Member
+               expMap   MemberAddrMap
+       }{
+               "empty": {
+                       addrMap: make(MemberAddrMap),
+                       toRemove: &system.Member{
+                               Addr: system.MockControlAddr(t, 1),
+                               UUID: uuid.New(),
+                       },
+                       expMap: make(MemberAddrMap),
+               },
+               "only one member": {
+                       addrMap:  makeMap(t, members(t, 1)...),
+                       toRemove: testMember(t, 0),
+                       expMap:   make(MemberAddrMap),
+               },
+               "first member": {
+                       addrMap:  makeMap(t, members(t, 3)...),
+                       toRemove: testMember(t, 0),
+                       expMap:   makeMap(t, testMember(t, 1), testMember(t, 2)),
+               },
+               "last member": {
+                       addrMap:  makeMap(t, members(t, 3)...),
+                       toRemove: testMember(t, 2),
+                       expMap:   makeMap(t, testMember(t, 0), testMember(t, 1)),
+               },
+               "middle member": {
+                       addrMap:  makeMap(t, members(t, 3)...),
+                       toRemove: testMember(t, 1),
+                       expMap:   makeMap(t, testMember(t, 0), testMember(t, 2)),
+               },
+               "not found": {
+                       addrMap:  makeMap(t, members(t, 3)...),
+                       toRemove: testMember(t, 6),
+                       expMap:   makeMap(t, members(t, 3)...),
+               },
+       } {
+               t.Run(name, func(t *testing.T) {
+                       tc.addrMap.removeMember(tc.toRemove)
+
+                       cmpOpt := cmpopts.IgnoreFields(system.Member{}, "LastUpdate")
+                       if diff := cmp.Diff(tc.expMap, tc.addrMap, cmpOpt); diff != "" {
+                               t.Fatalf("unexpected MemberAddrMap (-want, +got):\n%s\n", diff)
+                       }
+               })
+       }
+}
diff --git a/src/control/system/raft/database_test.go b/src/control/system/raft/database_test.go
index 9a51a969412..32e2a0e1b39 100644
--- a/src/control/system/raft/database_test.go
+++ b/src/control/system/raft/database_test.go
@@ -431,12 +431,55 @@ func ignoreFaultDomainIDOption() cmp.Option {
        }, cmp.Ignore())
 }
 
-func TestSystem_Database_memberRaftOps(t *testing.T) {
-       ctx := test.Context(t)
+func checkMemberDB(t *testing.T, mdb *MemberDatabase, expMember *Member, opts ...cmp.Option) {
+       t.Helper()
 
-       testMembers := make([]*Member, 0)
-       nextAddr := ctrlAddrGen(ctx, net.IPv4(127, 0, 0, 1), 4)
-       for i := 0; i < 3; i++ {
+       cmpOpts := []cmp.Option{
+               cmp.AllowUnexported(Member{}),
+               cmpopts.IgnoreFields(Member{}, "LastUpdate"),
+       }
+       cmpOpts = append(cmpOpts, opts...)
+
+       uuidM, ok := mdb.Uuids[expMember.UUID]
+       if !ok {
+               t.Errorf("member not found for UUID %s", expMember.UUID)
+       }
+       if diff := cmp.Diff(expMember, uuidM, cmpOpts...); diff != "" {
+               t.Fatalf("member wrong in UUID DB (-want, +got):\n%s\n", diff)
+       }
+
+       rankM, ok := mdb.Ranks[expMember.Rank]
+       if !ok {
+               t.Errorf("member not found for rank %d", expMember.Rank)
+       }
+       if diff := cmp.Diff(expMember, rankM, cmpOpts...); diff != "" {
+               t.Fatalf("member wrong in rank DB (-want, +got):\n%s\n", diff)
+       }
+
+       addrMs, ok := mdb.Addrs[expMember.Addr.String()]
+       if !ok {
+               t.Errorf("slice not found for addr %s", expMember.Addr.String())
+       }
+
+       found := false
+       for _, am := range addrMs {
+               if am.Rank == expMember.Rank {
+                       found = true
+                       if diff := cmp.Diff(expMember, am, cmpOpts...); diff != "" {
+                               t.Fatalf("member wrong in addr DB (-want, +got):\n%s\n", diff)
+                       }
+               }
+       }
+       if !found {
+               t.Fatalf("expected member %+v not found for addr %s", expMember, expMember.Addr.String())
+       }
+}
+
+func genTestMembers(t *testing.T, num int) []*Member {
+       ctx := test.Context(t)
+       nextAddr := ctrlAddrGen(ctx, net.IPv4(127, 0, 0, 1), 1)
+       testMembers := make([]*Member, 0, num)
+       for i := 0; i < num; i++ {
                testMembers = append(testMembers, &Member{
                        Rank:        Rank(i),
                        UUID:        uuid.New(),
@@ -445,6 +488,29 @@ func TestSystem_Database_memberRaftOps(t *testing.T) {
                })
        }
+       return testMembers
+}
+
+func expectMembersInDB(t *testing.T, db *Database, expMembers []*Member) {
+       for _, expMember := range expMembers {
+               checkMemberDB(t, db.data.Members, expMember)
+       }
+
+       if len(db.data.Members.Uuids) != len(expMembers) {
+               t.Fatalf("expected %d members, got %d", len(expMembers), len(db.data.Members.Uuids))
+       }
+
+       totalMemberAddrs := 0
+       for _, members := range db.data.Members.Addrs {
+               totalMemberAddrs += len(members)
+       }
+       if totalMemberAddrs != len(expMembers) {
+               t.Fatalf("expected %d members in address table, got %d: %+v", len(expMembers), totalMemberAddrs, db.data.Members.Addrs)
+       }
+}
+
+func TestSystem_Database_memberRaftOps(t *testing.T) {
+       testMembers := genTestMembers(t, 3)
 
        changedFaultDomainMember := &Member{
                Rank:        testMembers[1].Rank,
@@ -454,10 +520,6 @@ func TestSystem_Database_memberRaftOps(t *testing.T) {
                FaultDomain: MustCreateFaultDomainFromString("/rack1"),
        }
 
-       cmpOpts := []cmp.Option{
-               cmp.AllowUnexported(Member{}),
-       }
-
        for name, tc := range map[string]struct {
                startingMembers []*Member
                op              raftOp
@@ -544,48 +606,127 @@ func TestSystem_Database_memberRaftOps(t *testing.T) {
                        raftUpdateTestMember(t, db, tc.op, tc.updateMember)
 
                        // Check member DB was updated
-                       for _, expMember := range tc.expMembers {
-                               uuidM, ok := db.data.Members.Uuids[expMember.UUID]
-                               if !ok {
-                                       t.Errorf("member not found for UUID %s", expMember.UUID)
-                               }
-                               if diff := cmp.Diff(expMember, uuidM, cmpOpts...); diff != "" {
-                                       t.Fatalf("member wrong in UUID DB (-want, +got):\n%s\n", diff)
-                               }
+                       expectMembersInDB(t, db, tc.expMembers)
 
-                               rankM, ok := db.data.Members.Ranks[expMember.Rank]
-                               if !ok {
-                                       t.Errorf("member not found for rank %d", expMember.Rank)
-                               }
-                               if diff := cmp.Diff(expMember, rankM, cmpOpts...); diff != "" {
-                                       t.Fatalf("member wrong in rank DB (-want, +got):\n%s\n", diff)
-                               }
-                               addrMs, ok := db.data.Members.Addrs[expMember.Addr.String()]
-                               if !ok {
-                                       t.Errorf("slice not found for addr %s", expMember.Addr.String())
-                               }
 
-                               found := false
-                               for _, am := range addrMs {
-                                       if am.Rank == expMember.Rank {
-                                               found = true
-                                               if diff := cmp.Diff(expMember, am, cmpOpts...); diff != "" {
-                                                       t.Fatalf("member wrong in addr DB (-want, +got):\n%s\n", diff)
-                                               }
-                                       }
-                               }
-                               if !found {
-                                       t.Fatalf("expected member %+v not found for addr %s", expMember, expMember.Addr.String())
-                               }
-                       }
-
-                       if len(db.data.Members.Uuids) != len(tc.expMembers) {
-                               t.Fatalf("expected %d members, got %d", len(tc.expMembers), len(db.data.Members.Uuids))
-                       }
 
                        if diff := cmp.Diff(tc.expFDTree, db.data.Members.FaultDomains, ignoreFaultDomainIDOption()); diff != "" {
                                t.Fatalf("wrong FaultDomainTree in DB (-want, +got):\n%s\n", diff)
                        }
                })
        }
 }
+
+func TestSystem_Database_UpdateMember(t *testing.T) {
+       const numReplicas = 3
+       testMembers := genTestMembers(t, 5)
+       fakeUUID := uuid.MustParse("ffffffff-ffff-ffff-ffff-ffffffffffff")
+
+       for name, tc := range map[string]struct {
+               startingMembers []*Member
+               notLeader       bool
+               updateMember    *Member
+               expMembers      []*Member
+               expErr          error
+               expVotersAdded  []string
+       }{
+               "not leader": {
+                       startingMembers: testMembers,
+                       updateMember:    testMembers[1],
+                       notLeader:       true,
+                       expMembers:      testMembers,
+                       expErr:          errors.New("Management Service leader"),
+               },
+               "member not found": {
+                       startingMembers: testMembers,
+                       updateMember: &Member{
+                               Rank: 123,
+                               UUID: fakeUUID,
+                       },
+                       expMembers: testMembers,
+                       expErr:     ErrMemberUUIDNotFound(fakeUUID),
+               },
+               "update replica": {
+                       startingMembers: testMembers,
+                       updateMember: &Member{
+                               Rank:        testMembers[1].Rank,
+                               UUID:        testMembers[1].UUID,
+                               Addr:        testMembers[1].Addr,
+                               State:       MemberStateReady,
+                               FaultDomain: testMembers[1].FaultDomain,
+                       },
+                       expMembers: []*Member{
+                               testMembers[0],
+                               {
+                                       Rank:        testMembers[1].Rank,
+                                       UUID:        testMembers[1].UUID,
+                                       Addr:        testMembers[1].Addr,
+                                       State:       MemberStateReady,
+                                       FaultDomain: testMembers[1].FaultDomain,
+                               },
+                               testMembers[2],
+                               testMembers[3],
+                               testMembers[4],
+                       },
+                       expVotersAdded: []string{testMembers[1].Addr.String()},
+               },
+               "update non-replica": {
+                       startingMembers: testMembers,
+                       updateMember: &Member{
+                               Rank:        testMembers[4].Rank,
+                               UUID:        testMembers[4].UUID,
+                               Addr:        testMembers[4].Addr,
+                               State:       MemberStateReady,
+                               FaultDomain: testMembers[4].FaultDomain,
+                       },
+                       expMembers: []*Member{
+                               testMembers[0],
+                               testMembers[1],
+                               testMembers[2],
+                               testMembers[3],
+                               {
+                                       Rank:        testMembers[4].Rank,
+                                       UUID:        testMembers[4].UUID,
+                                       Addr:        testMembers[4].Addr,
+                                       State:       MemberStateReady,
+                                       FaultDomain: testMembers[4].FaultDomain,
+                               },
+                       },
+               },
+       } {
+               t.Run(name, func(t *testing.T) {
+                       log, _ := logging.NewTestLogger(t.Name())
+                       ctx := test.MustLogContext(t, log)
+                       replicaAddrs := make([]*net.TCPAddr, 0, numReplicas)
+                       for i := 0; i < numReplicas; i++ {
+                               replicaAddrs = append(replicaAddrs, tc.startingMembers[i].Addr)
+                       }
+
+                       t.Logf("%+v", replicaAddrs)
+
+                       db := MockDatabaseWithCfg(t, logging.FromContext(ctx), &DatabaseConfig{
+                               Replicas: replicaAddrs,
+                       })
+                       db.replicaAddr = replicaAddrs[0]
+
+                       // setup initial member DB
+                       for _, initMember := range tc.startingMembers {
+                               raftUpdateTestMember(t, db, raftOpAddMember, initMember)
+                       }
+
+                       mockRaftSvc, ok := db.raft.svc.(*mockRaftService)
+                       if !ok {
+                               t.Fatal("raft service used wasn't a mockRaftService!")
+                       }
+                       if tc.notLeader {
+                               mockRaftSvc.cfg.State = raft.Follower
+                       }
+
+                       err := db.UpdateMember(tc.updateMember)
+
+                       test.CmpErr(t, tc.expErr, err)
+                       expectMembersInDB(t, db, tc.expMembers)
+
+                       if diff := cmp.Diff(tc.expVotersAdded, mockRaftSvc.addVoterCalledForAddrs); diff != "" {
+                               t.Fatalf("wrong raft voters added (-want, +got):\n%s\n", diff)
+                       }
+               })
+       }
+}
@@ -819,7 +960,7 @@ func TestSystem_Database_OnEvent(t *testing.T) {
        }
 }
 
-func TestSystemDatabase_PoolServiceList(t *testing.T) {
+func TestSystem_Database_PoolServiceList(t *testing.T) {
        ready := &PoolService{
                PoolUUID:  uuid.New(),
                PoolLabel: "pool0001",
@@ -1061,7 +1202,7 @@ func TestSystem_Database_GroupMap(t *testing.T) {
        }
 }
 
-func Test_Database_ResignLeadership(t *testing.T) {
+func TestSystem_Database_ResignLeadership(t *testing.T) {
        for name, tc := range map[string]struct {
                cause  error
                expErr error
@@ -1116,7 +1257,7 @@ func Test_Database_ResignLeadership(t *testing.T) {
        }
 }
 
-func TestDatabase_TakePoolLock(t *testing.T) {
+func TestSystem_Database_TakePoolLock(t *testing.T) {
        mockUUID := uuid.MustParse(test.MockUUID(1))
        parentLock := makeLock(1, 1, 1)
        wrongIdLock := makeLock(1, 2, 1)
diff --git a/src/control/system/raft/mocks.go b/src/control/system/raft/mocks.go
index ef82a3b95b0..9f6b9a846e8 100644
--- a/src/control/system/raft/mocks.go
+++ b/src/control/system/raft/mocks.go
@@ -1,5 +1,6 @@
 //
 // (C) Copyright 2020-2024 Intel Corporation.
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -34,8 +35,9 @@ type (
                BarrierReturn raft.Future
        }
 
        mockRaftService struct {
-               cfg mockRaftServiceConfig
-               fsm raft.FSM
+               cfg                    mockRaftServiceConfig
+               fsm                    raft.FSM
+               addVoterCalledForAddrs []string
        }
 )
@@ -49,7 +51,8 @@ func (mrs *mockRaftService) Apply(cmd []byte, timeout time.Duration) raft.ApplyF
        return &mockRaftFuture{}
 }
 
-func (mr *mockRaftService) AddVoter(_ raft.ServerID, _ raft.ServerAddress, _ uint64, _ time.Duration) raft.IndexFuture {
+func (mr *mockRaftService) AddVoter(_ raft.ServerID, addr raft.ServerAddress, _ uint64, _ time.Duration) raft.IndexFuture {
+       mr.addVoterCalledForAddrs = append(mr.addVoterCalledForAddrs, string(addr))
        return &mockRaftFuture{}
 }
@@ -92,6 +95,11 @@ func (mrs *mockRaftService) Barrier(time.Duration) raft.Future {
        return mrs.cfg.BarrierReturn
 }
 
+// resetCalls resets call counts for the mock.
+func (mrs *mockRaftService) resetCalls() {
+       mrs.addVoterCalledForAddrs = nil
+}
+
 func newMockRaftService(cfg *mockRaftServiceConfig, fsm raft.FSM) *mockRaftService {
        if cfg == nil {
                cfg = &mockRaftServiceConfig{
diff --git a/src/control/system/raft/raft.go b/src/control/system/raft/raft.go
index ef30c226aa8..35a6535af6e 100644
--- a/src/control/system/raft/raft.go
+++ b/src/control/system/raft/raft.go
@@ -1,5 +1,6 @@
 //
 // (C) Copyright 2020-2024 Intel Corporation.
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -296,6 +297,40 @@ func (db *Database) ConfigureTransport(srv *grpc.Server, dialOpts ...grpc.DialOp
        return nil
 }
 
+type getCfgFn func(logging.Logger, *DatabaseConfig) (raft.Configuration, error)
+type recoverFn func(logging.Logger, *DatabaseConfig) error
+
+// replicasRemoved checks whether replicas have been removed from the system configuration since the
+// last time the raft service ran. Raft persists the list in its configuration.
+func (db *Database) replicasRemoved(getCfg getCfgFn) (bool, error) {
+       raftCfg, err := getCfg(db.log, db.cfg)
+       if err != nil {
+               return false, errors.Wrap(err, "getting raft config")
+       }
+
+       current := common.NewStringSet(db.cfg.stringReplicas()...)
+       for _, raftSrv := range raftCfg.Servers {
+               if !current.Has(string(raftSrv.Address)) {
+                       db.log.Debugf("detected removed MS replica: %s", raftSrv.Address)
+                       return true, nil
+               }
+       }
+       return false, nil
+}
+
+// recoverIfReplicasRemoved detects whether nodes have been removed from the last known MS replica
+// list. If so, it may not be possible to hold an election with the voter list in the raft config.
+// We must recover the DB with a fresh raft configuration to allow an election to be held.
+func (db *Database) recoverIfReplicasRemoved(getCfg getCfgFn, recover recoverFn) error {
+       if removed, err := db.replicasRemoved(getCfg); err != nil {
+               return errors.Wrap(err, "checking for removed replicas")
+       } else if removed {
+               db.log.Infof("detected at least one MS replica removed, attempting to recover raft DB")
+               return recover(db.log, db.cfg)
+       }
+       return nil
+}
+
 // initRaft sets up the backing raft service for use. If the service has
 // already been bootstrapped, then it will start immediately. Otherwise,
 // it will need to be bootstrapped before it can be used.
@@ -304,6 +339,10 @@ func (db *Database) initRaft() error {
                return err
        }
 
+       if err := db.recoverIfReplicasRemoved(GetRaftConfiguration, RecoverLocalReplica); err != nil {
+               return err
+       }
+
        cmps, err := ConfigureComponents(db.log, db.cfg)
        if err != nil {
                return errors.Wrap(err, "failed to configure raft components")
@@ -454,7 +493,8 @@ func (db *Database) submitMemberUpdate(op raftOp, m *memberUpdate) error {
        if err != nil {
                return err
        }
-       db.log.Debugf("member %d:%x updated @ %s", m.Member.Rank, m.Member.Incarnation, common.FormatTime(m.Member.LastUpdate))
+       db.log.Debugf("member %d:%x %s @ %s", m.Member.Rank, m.Member.Incarnation, op,
+               common.FormatTime(m.Member.LastUpdate))
        return db.submitRaftUpdate(data)
 }
 
diff --git a/src/control/system/raft/raft_test.go b/src/control/system/raft/raft_test.go
index f294a1785e0..05d67eae2ae 100644
--- a/src/control/system/raft/raft_test.go
+++ b/src/control/system/raft/raft_test.go
@@ -1,5 +1,6 @@
 //
 // (C) Copyright 2024 Intel Corporation.
+// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -7,6 +8,7 @@
 package raft
 
 import (
+       "net"
        "testing"
        "time"
 
@@ -99,3 +101,97 @@ func TestRaft_Database_WaitForLeaderStepUp(t *testing.T) {
                })
        }
 }
+
+func TestRaft_recoverIfReplicasRemoved(t *testing.T) {
+       testReplicaAddrs := func(n int) []*net.TCPAddr {
+               addrs := make([]*net.TCPAddr, 0, n)
+               for i := 0; i < n; i++ {
+                       addrs = append(addrs, system.MockControlAddr(t, uint32(i)))
+               }
+               return addrs
+       }
+
+       testRaftServers := func(addrs ...*net.TCPAddr) []raft.Server {
+               srvs := make([]raft.Server, 0, len(addrs))
+               for _, a := range addrs {
+                       srvs = append(srvs, raft.Server{
+                               ID:      raft.ServerID(a.String()),
+                               Address: raft.ServerAddress(a.String()),
+                       })
+               }
+               return srvs
+       }
+
+       for name, tc := range map[string]struct {
+               replicaAddrs     []*net.TCPAddr
+               getCfgResult     raft.Configuration
+               getCfgErr        error
+               recoverErr       error
+               expRecoverCalled bool
+               expErr           error
+       }{
+               "get config fails": {
+                       replicaAddrs: testReplicaAddrs(1),
+                       getCfgErr:    errors.New("mock getCfg"),
+                       expErr:       errors.New("mock getCfg"),
+               },
+               "single server, no change": {
+                       replicaAddrs: testReplicaAddrs(1),
+                       getCfgResult: raft.Configuration{
+                               Servers: testRaftServers(testReplicaAddrs(1)...),
+                       },
+               },
+               "multi server, no change": {
+                       replicaAddrs: testReplicaAddrs(5),
+                       getCfgResult: raft.Configuration{
+                               Servers: testRaftServers(testReplicaAddrs(5)...),
+                       },
+               },
+               "server added": {
+                       replicaAddrs: testReplicaAddrs(6),
+                       getCfgResult: raft.Configuration{
+                               Servers: testRaftServers(testReplicaAddrs(5)...),
+                       },
+               },
+               "server removed": {
+                       replicaAddrs: testReplicaAddrs(3),
+                       getCfgResult: raft.Configuration{
+                               Servers: testRaftServers(testReplicaAddrs(4)...),
+                       },
+                       expRecoverCalled: true,
+               },
+               "recover error": {
+                       replicaAddrs: testReplicaAddrs(3),
+                       getCfgResult: raft.Configuration{
+                               Servers: testRaftServers(testReplicaAddrs(4)...),
+                       },
+                       recoverErr:       errors.New("mock recover"),
+                       expErr:           errors.New("mock recover"),
+                       expRecoverCalled: true,
+               },
+       } {
+               t.Run(name, func(t *testing.T) {
+                       log, _ := logging.NewTestLogger(t.Name())
+                       ctx := test.MustLogContext(t, log)
+
+                       mockGetCfg := func(_ logging.Logger, _ *DatabaseConfig) (raft.Configuration, error) {
+                               return tc.getCfgResult, tc.getCfgErr
+                       }
+
+                       recoverCalled := false
+                       mockRecover := func(_ logging.Logger, _ *DatabaseConfig) error {
+                               recoverCalled = true
+                               return tc.recoverErr
+                       }
+
+                       db := MockDatabaseWithCfg(t, logging.FromContext(ctx), &DatabaseConfig{
+                               Replicas: tc.replicaAddrs,
+                       })
+
+                       err := db.recoverIfReplicasRemoved(mockGetCfg, mockRecover)
+
+                       test.CmpErr(t, tc.expErr, err)
+                       test.AssertEqual(t, tc.expRecoverCalled, recoverCalled, "")
+               })
+       }
+}
diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c
index 5c4757afc2e..39c1baf6a1f 100644
--- a/src/dtx/dtx_common.c
+++ b/src/dtx/dtx_common.c
@@ -39,7 +39,7 @@ struct dtx_batched_pool_args {
 };
 
 struct dtx_batched_cont_args {
-       /* Link to dss_module_info::dmi_dtx_batched_cont_{open,close}_list. */
+       /* Link to dss_module_info::dmi_dtx_batched_cont_open_list when opened. */
        d_list_t dbca_sys_link;
       /* Link to dtx_batched_pool_args::dbpa_cont_list. */
       d_list_t dbca_pool_link;
@@ -68,7 +68,11 @@ struct dtx_cleanup_cb_args {
       d_list_t dcca_pc_list;
       int      dcca_st_count;
       int      dcca_pc_count;
-      uint32_t dcca_cleanup_thd;
+      union {
+              uint32_t dcca_cleanup_thd;
+              uint32_t dcca_version;
+      };
+      bool     dcca_for_orphan;
 };
 
 static inline void
@@ -148,6 +152,9 @@ dtx_free_dbca(struct dtx_batched_cont_args *dbca)
       if (dbca->dbca_agg_req != NULL)
               sched_req_put(dbca->dbca_agg_req);
 
+      D_ASSERTF(d_list_empty(&dbca->dbca_sys_link), "dbca (%p) for " DF_UUID " is still linked\n",
+                dbca, DP_UUID(dbca->dbca_cont->sc_uuid));
+
       D_FREE(dbca);
       cont->sc_dtx_registered = 0;
       ds_cont_child_put(cont);
@@ -204,9 +211,13 @@ dtx_cleanup_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args)
       if (ent->ie_dtx_tgt_cnt == 0)
               return 0;
 
-      /* Stop the iteration if current DTX is not too old. */
-      if (dtx_sec2age(ent->ie_dtx_start_time) <= dcca->dcca_cleanup_thd)
+      if (dcca->dcca_for_orphan) {
+              if (ent->ie_dtx_ver >= dcca->dcca_version)
+                      return 0;
+      } else if (dtx_sec2age(ent->ie_dtx_start_time) <= dcca->dcca_cleanup_thd) {
+              /* Stop the iteration if current DTX is not too old. */
               return 1;
+      }
 
       D_ASSERT(ent->ie_dtx_mbs_dsize > 0);
 
@@ -288,12 +299,11 @@ dtx_dpci_free(struct dtx_partial_cmt_item *dpci)
       D_FREE(dpci);
 }
 
-static void
-dtx_cleanup(void *arg)
+int
+dtx_cleanup_internal(struct ds_cont_child *cont, struct sched_request *sr, uint32_t thd,
+                    bool for_orphan)
 {
-      struct dss_module_info       *dmi = dss_get_module_info();
-      struct dtx_batched_cont_args *dbca = arg;
-      struct ds_cont_child         *cont = dbca->dbca_cont;
+      struct dss_module_info       *dmi = dss_get_module_info();
       struct dtx_share_peer        *dsp;
       struct dtx_partial_cmt_item  *dpci;
       struct dtx_entry             *dte;
@@ -304,26 +314,26 @@ dtx_cleanup(void *arg)
       d_list_t                      act_list;
       int                           count;
       int                           rc;
-
-      if (dbca->dbca_cleanup_req == NULL)
-              goto out;
+      int                           rc1 = 0;
 
       D_INIT_LIST_HEAD(&cmt_list);
       D_INIT_LIST_HEAD(&abt_list);
       D_INIT_LIST_HEAD(&act_list);
       D_INIT_LIST_HEAD(&dcca.dcca_st_list);
       D_INIT_LIST_HEAD(&dcca.dcca_pc_list);
-      dcca.dcca_st_count = 0;
-      dcca.dcca_pc_count = 0;
-      /* Cleanup stale DTX entries within about 10 seconds windows each time. */
-      dcca.dcca_cleanup_thd = dbca->dbca_cleanup_thd - 10;
+      dcca.dcca_st_count = 0;
+      dcca.dcca_pc_count = 0;
+      dcca.dcca_cleanup_thd = thd;
+      dcca.dcca_for_orphan = for_orphan;
       rc = ds_cont_iter(cont->sc_pool->spc_hdl, cont->sc_uuid, dtx_cleanup_iter_cb,
                        &dcca, VOS_ITER_DTX, 0);
-      if (rc < 0)
-              D_WARN("Failed to scan DTX entry for cleanup "
-                     DF_UUID": "DF_RC"\n", DP_UUID(cont->sc_uuid), DP_RC(rc));
+      if (rc < 0) {
+              D_ERROR("Failed to iter DTX for cleanup %s on " DF_UUID ", ver %u: " DF_RC "\n",
+                      for_orphan ? "orphan" : "stale", DP_UUID(cont->sc_uuid), thd, DP_RC(rc));
+              goto out;
+      }
 
-      while (!dss_ult_exiting(dbca->dbca_cleanup_req) && !d_list_empty(&dcca.dcca_st_list)) {
+      while ((sr == NULL || !dss_ult_exiting(sr)) && !d_list_empty(&dcca.dcca_st_list)) {
               if (dcca.dcca_st_count > DTX_REFRESH_MAX) {
                       count = DTX_REFRESH_MAX;
                       dcca.dcca_st_count -= DTX_REFRESH_MAX;
@@ -338,8 +348,11 @@ dtx_cleanup(void *arg)
                * that all the DTX entries in the check list will be handled
                * even if some former ones hit failure.
                */
-              rc = dtx_refresh_internal(cont, &count, &dcca.dcca_st_list,
-                                        &cmt_list, &abt_list, &act_list, false);
+              rc = dtx_refresh_internal(cont, &count, &dcca.dcca_st_list, &cmt_list, &abt_list,
DRI_ORPHAN : DRI_STALE); + if (rc != 0 && rc1 == 0) + rc1 = rc; + D_ASSERTF(count == 0, "%d entries are not handled: "DF_RC"\n", count, DP_RC(rc)); } @@ -348,7 +361,7 @@ dtx_cleanup(void *arg) D_ASSERT(d_list_empty(&abt_list)); D_ASSERT(d_list_empty(&act_list)); - while (!dss_ult_exiting(dbca->dbca_cleanup_req) && !d_list_empty(&dcca.dcca_pc_list)) { + while ((sr == NULL || !dss_ult_exiting(sr)) && !d_list_empty(&dcca.dcca_pc_list)) { dpci = d_list_pop_entry(&dcca.dcca_pc_list, struct dtx_partial_cmt_item, dpci_link); dcca.dcca_pc_count--; @@ -378,6 +391,7 @@ dtx_cleanup(void *arg) dtx_dpci_free(dpci); } +out: while ((dsp = d_list_pop_entry(&dcca.dcca_st_list, struct dtx_share_peer, dsp_link)) != NULL) dtx_dsp_free(dsp); @@ -386,9 +400,21 @@ dtx_cleanup(void *arg) dpci_link)) != NULL) dtx_dpci_free(dpci); - dbca->dbca_cleanup_done = 1; + return rc != 0 ? rc : rc1; +} + +static void +dtx_cleanup(void *arg) +{ + struct dtx_batched_cont_args *dbca = arg; + + if (dbca->dbca_cleanup_req != NULL) { + /* Clean up stale DTX entries within a window of about 10 seconds each time. */ + dtx_cleanup_internal(dbca->dbca_cont, dbca->dbca_cleanup_req, + dbca->dbca_cleanup_thd - 10, false); + dbca->dbca_cleanup_done = 1; + } -out: dtx_put_dbca(dbca); } @@ -621,12 +647,9 @@ dtx_batched_commit_one(void *arg) DAOS_EPOCH_MAX, false, &dtes, NULL, &dce); if (cnt == 0) { if (dbca->dbca_flush_pending) { - D_ASSERT(!dtx_cont_opened(cont)); - dbca->dbca_flush_pending = 0; - d_list_del(&dbca->dbca_sys_link); - d_list_add_tail(&dbca->dbca_sys_link, - &dmi->dmi_dtx_batched_cont_close_list); + if (likely(dbca->dbca_deregister == 0)) + d_list_del_init(&dbca->dbca_sys_link); } break; } @@ -714,8 +737,7 @@ dtx_batched_commit(void *arg) dtx_get_dbca(dbca); cont = dbca->dbca_cont; - d_list_move_tail(&dbca->dbca_sys_link, - &dmi->dmi_dtx_batched_cont_open_list); + d_list_move_tail(&dbca->dbca_sys_link, &dmi->dmi_dtx_batched_cont_open_list); dtx_stat(cont, &stat); if (dbca->dbca_commit_req != NULL && dbca->dbca_commit_done) { @@ -852,18 +874,19 @@ dtx_shares_fini(struct dtx_handle *dth) int dtx_handle_reinit(struct dtx_handle *dth) { + D_ASSERT(dth->dth_aborted == 0); + D_ASSERT(dth->dth_already == 0); + if (dth->dth_modification_cnt > 0) { D_ASSERT(dth->dth_ent != NULL); D_ASSERT(dth->dth_pinned != 0); } - D_ASSERT(dth->dth_already == 0); - dth->dth_modify_shared = 0; - dth->dth_active = 0; + dth->dth_modify_shared = 0; + dth->dth_active = 0; dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_cos_done = 0; - dth->dth_aborted = 0; + dth->dth_local_tx_started = 0; + dth->dth_cos_done = 0; dth->dth_op_seq = 0; dth->dth_oid_cnt = 0; @@ -1267,7 +1290,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_child *cont, int re struct dtx_memberships *mbs; size_t size; uint32_t flags; - int status = -1; + int status; int rc = 0; bool aborted = false; @@ -1284,41 +1307,16 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_child *cont, int re if (dth->dth_solo) goto out; - if (dth->dth_need_validation) { - /* During waiting for bulk data transfer or other non-leaders, the DTX - * status may be changes by others (such as DTX resync or DTX refresh) - * by race. Let's check it before handling the case of 'result < 0' to - * avoid aborting 'ready' one. - */ - status = vos_dtx_validation(dth); - if (unlikely(status == DTX_ST_COMMITTED || status == DTX_ST_COMMITTABLE || - status == DTX_ST_COMMITTING)) - D_GOTO(out, result = -DER_ALREADY); - } - - if (result < 0) + /* While waiting for bulk data transfer or for other non-leaders, the DTX status may be + * changed by others (such as DTX resync or DTX refresh) because of a race. Let's check + * it before handling the case of 'result < 0' to avoid aborting a 'ready' one. + */ + status = vos_dtx_validation(dth); + if (status != -DER_ALREADY && result < 0) goto abort; - switch (status) { - case -1: - break; - case DTX_ST_PREPARED: - if (likely(!dth->dth_aborted)) - break; - /* Fall through */ - case DTX_ST_INITED: - case DTX_ST_PREPARING: - aborted = true; - result = -DER_AGAIN; - goto out; - case DTX_ST_ABORTED: - case DTX_ST_ABORTING: - aborted = true; - result = -DER_INPROGRESS; - goto out; - default: - D_ASSERTF(0, "Unexpected DTX "DF_DTI" status %d\n", DP_DTI(&dth->dth_xid), status); - } + if (status < 0) + D_GOTO(out, result = status); if (dlh->dlh_relay) goto out; @@ -1677,13 +1675,10 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db out: if (rc < 0) { - D_ERROR(DF_UUID": Fail to flush CoS cache: rc = %d\n", - DP_UUID(cont->sc_uuid), rc); - if (likely(!dtx_cont_opened(cont))) { - d_list_del(&dbca->dbca_sys_link); - /* Give it to the batched commit for further handling asynchronously. */ + D_ERROR(DF_UUID ": Failed to flush CoS cache: rc = %d\n", DP_UUID(cont->sc_uuid), rc); + if (likely(d_list_empty(&dbca->dbca_sys_link) && dbca->dbca_deregister == 0)) + /* Add it to the batched commit for further handling asynchronously. */ d_list_add_tail(&dbca->dbca_sys_link, &dmi->dmi_dtx_batched_cont_open_list); - } } else { dbca->dbca_flush_pending = 0; } @@ -1855,7 +1850,7 @@ dtx_cont_register(struct ds_cont_child *cont) dbca->dbca_cont = cont; dbca->dbca_pool = dbpa; dbca->dbca_agg_gen = tls->dt_agg_gen; - d_list_add_tail(&dbca->dbca_sys_link, &dmi->dmi_dtx_batched_cont_close_list); + D_INIT_LIST_HEAD(&dbca->dbca_sys_link); d_list_add_tail(&dbca->dbca_pool_link, &dbpa->dbpa_cont_list); if (new_pool) d_list_add_tail(&dbpa->dbpa_sys_link, &dmi->dmi_dtx_batched_pool_list); @@ -1918,10 +1913,14 @@ dtx_cont_open(struct ds_cont_child *cont) if (rc != 0) return rc; - dbca->dbca_flush_pending = 0; - d_list_del(&dbca->dbca_sys_link); - d_list_add_tail(&dbca->dbca_sys_link, - &dmi->dmi_dtx_batched_cont_open_list); + if (unlikely(dbca->dbca_deregister == 1)) + return -DER_SHUTDOWN; + + if (dbca->dbca_flush_pending) + dbca->dbca_flush_pending = 0; + else + d_list_add_tail(&dbca->dbca_sys_link, + &dmi->dmi_dtx_batched_cont_open_list); return 0; } } @@ -1953,14 +1952,13 @@ dtx_cont_close(struct ds_cont_child *cont, bool force) stop_dtx_reindex_ult(cont, force); /* To handle potentially re-open by race. 
*/ - if (unlikely(dtx_cont_opened(cont))) { - dtx_put_dbca(dbca); - return; - } + if (unlikely(dtx_cont_opened(cont))) + goto put; + + if (unlikely(dbca->dbca_deregister == 1)) + goto put; - d_list_del(&dbca->dbca_sys_link); - d_list_add_tail(&dbca->dbca_sys_link, - &dmi->dmi_dtx_batched_cont_close_list); + d_list_del_init(&dbca->dbca_sys_link); dtx_flush_on_close(dmi, dbca); @@ -1974,6 +1972,7 @@ dtx_cont_close(struct ds_cont_child *cont, bool force) if (likely(!dtx_cont_opened(cont) && cont->sc_dtx_delay_reset == 0)) vos_dtx_cache_reset(cont->sc_hdl, false); +put: dtx_put_dbca(dbca); return; } diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index ec6d6fdcedb..9f25c3b19a6 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -259,6 +259,7 @@ extern struct crt_proto_format dtx_proto_fmt; extern btr_ops_t dbtree_dtx_cf_ops; extern btr_ops_t dtx_btr_cos_ops; +/* clang-format off */ /* dtx_common.c */ int dtx_handle_reinit(struct dtx_handle *dth); void dtx_batched_commit(void *arg); @@ -267,6 +268,8 @@ int start_dtx_reindex_ult(struct ds_cont_child *cont); void dtx_merge_check_result(int *tgt, int src); int dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, daos_unit_oid_t *oid, uint32_t version, struct pool_target **p_tgt); +int dtx_cleanup_internal(struct ds_cont_child *cont, struct sched_request *sr, uint32_t thd, + bool for_orphan); /* dtx_cos.c */ int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, @@ -289,7 +292,8 @@ int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); int dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); int dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *check_list, - d_list_t *cmt_list, d_list_t *abt_list, d_list_t *act_list, bool for_io); + d_list_t *cmt_list, d_list_t *abt_list, d_list_t *act_list, + uint32_t intent); int dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_unit_oid_t oid, uint64_t dkey_hash, daos_epoch_t epoch, int *tgt_array, int *err); @@ -300,6 +304,7 @@ int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, uint32_t pm_ver, bool for_check, bool need_hint, struct dtx_coll_entry **p_dce); int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results); +/* clang-format on */ enum dtx_status_handle_result { DSHR_NEED_COMMIT = 1, @@ -310,8 +315,15 @@ enum dtx_status_handle_result { }; enum dtx_rpc_flags { - DRF_INITIAL_LEADER = (1 << 0), - DRF_SYNC_COMMIT = (1 << 1), + DRF_INITIAL_LEADER = (1 << 0), + DRF_SYNC_COMMIT = (1 << 1), + DRF_FOR_ORPHAN = (1 << 2), +}; + +enum dtx_refresh_intent { + DRI_IO = 1, + DRI_STALE = 2, + DRI_ORPHAN = 3, }; enum dtx_cos_flags { diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index 9478ad4e96e..444e5a143e0 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -588,9 +588,8 @@ dtx_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args) } int -dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, bool block) +dtx_resync(daos_handle_t po_hdl, struct ds_cont_child *cont, uint32_t ver, bool block) { - struct ds_cont_child *cont = NULL; struct ds_pool *pool; struct pool_target *target; struct dtx_resync_args dra = { 0 }; @@ -598,23 +597,16 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b int rc = 0; int rc1 = 0; - rc = 
ds_cont_child_lookup(po_uuid, co_uuid, &cont); - if (rc != 0) { - D_ERROR("Failed to open container for resync DTX " - DF_UUID"/"DF_UUID": rc = %d\n", - DP_UUID(po_uuid), DP_UUID(co_uuid), rc); - return rc; - } - - D_DEBUG(DB_MD, "Enter DTX resync (%s) for "DF_UUID"/"DF_UUID" with ver %u\n", - block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver); + D_DEBUG(DB_MD, "Enter DTX resync (%s) for " DF_UUID "/" DF_UUID " with ver %u\n", + block ? "sync" : "async", DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), ver); crt_group_rank(NULL, &myrank); pool = cont->sc_pool->spc_pool; if (pool->sp_disable_dtx_resync) { D_DEBUG(DB_MD, "Skip DTX resync (%s) for " DF_UUID "/" DF_UUID " with ver %u\n", - block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver); + block ? "sync" : "async", DP_UUID(cont->sc_pool_uuid), + DP_UUID(cont->sc_uuid), ver); goto out; } @@ -625,8 +617,8 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b if (target->ta_comp.co_status == PO_COMP_ST_UP) { dra.discard_version = target->ta_comp.co_in_ver; - D_DEBUG(DB_MD, "DTX resync for "DF_UUID"/"DF_UUID" discard version: %u\n", - DP_UUID(po_uuid), DP_UUID(co_uuid), dra.discard_version); + D_DEBUG(DB_MD, "DTX resync for " DF_UUID "/" DF_UUID " discard version: %u\n", + DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), dra.discard_version); } ABT_rwlock_unlock(pool->sp_lock); @@ -638,8 +630,7 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b ABT_mutex_unlock(cont->sc_mutex); goto out; } - D_DEBUG(DB_TRACE, "Waiting for resync of "DF_UUID"\n", - DP_UUID(co_uuid)); + D_DEBUG(DB_TRACE, "Waiting for resync of " DF_UUID "\n", DP_UUID(cont->sc_uuid)); ABT_cond_wait(cont->sc_dtx_resync_cond, cont->sc_mutex); } @@ -681,10 +672,10 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b } } - D_DEBUG(DB_MD, "Start DTX resync (%s) scan for "DF_UUID"/"DF_UUID" with ver %u\n", - block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver); + D_DEBUG(DB_MD, "Start DTX resync (%s) scan for " DF_UUID "/" DF_UUID " with ver %u\n", + block ? "sync" : "async", DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), ver); - rc = ds_cont_iter(po_hdl, co_uuid, dtx_iter_cb, &dra, VOS_ITER_DTX, 0); + rc = ds_cont_iter(po_hdl, cont->sc_uuid, dtx_iter_cb, &dra, VOS_ITER_DTX, 0); /* Handle the DTXs that have been scanned even if some failure happened * in above ds_cont_iter() step. @@ -699,8 +690,10 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b if (rc >= 0) vos_set_dtx_resync_version(cont->sc_hdl, ver); - D_DEBUG(DB_MD, "Stop DTX resync (%s) scan for "DF_UUID"/"DF_UUID" with ver %u: rc = %d\n", - block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver, rc); + DL_CDEBUG(rc != 0, DLOG_ERR, DB_MD, rc, + "Stop DTX resync (%s) scan for " DF_UUID "/" DF_UUID " with ver %u", + block ? "sync" : "async", DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + ver); fail: ABT_mutex_lock(cont->sc_mutex); @@ -709,10 +702,11 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b ABT_mutex_unlock(cont->sc_mutex); out: - D_DEBUG(DB_MD, "Exit DTX resync (%s) for "DF_UUID"/"DF_UUID" with ver %u, rc = %d\n", - block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver, rc); + DL_CDEBUG(rc != 0, DLOG_ERR, DB_MD, rc, + "Exit DTX resync (%s) scan for " DF_UUID "/" DF_UUID " with ver %u", + block ? 
"sync" : "async", DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + ver); - ds_cont_child_put(cont); return rc > 0 ? 0 : rc; } @@ -726,24 +720,41 @@ container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, vos_iter_type_t type, vos_iter_param_t *iter_param, void *data, unsigned *acts) { - struct dtx_container_scan_arg *scan_arg = data; - struct dtx_scan_args *arg = &scan_arg->arg; - int rc; + struct dtx_container_scan_arg *scan_arg = data; + struct dtx_scan_args *arg = &scan_arg->arg; + struct ds_cont_child *cont = NULL; + int rc; - if (uuid_compare(scan_arg->co_uuid, entry->ie_couuid) == 0) { - D_DEBUG(DB_REBUILD, DF_UUID" already scan\n", - DP_UUID(scan_arg->co_uuid)); + if (uuid_compare(scan_arg->co_uuid, entry->ie_couuid) == 0) return 0; - } + + rc = ds_cont_child_lookup(arg->pool_uuid, entry->ie_couuid, &cont); + if (rc != 0) + goto out; uuid_copy(scan_arg->co_uuid, entry->ie_couuid); - rc = dtx_resync(iter_param->ip_hdl, arg->pool_uuid, entry->ie_couuid, arg->version, true); - if (rc) - D_ERROR(DF_UUID" dtx resync failed: rc %d\n", - DP_UUID(arg->pool_uuid), rc); + if (arg->for_orphan) { +again: + rc = dtx_cleanup_internal(cont, NULL, arg->version, true); + if (rc == -DER_INPROGRESS || rc == -DER_OOG || rc == -DER_HG) { + D_WARN("Cleanup DTX for " DF_UUID "/" DF_UUID " is blocked " DF_RC "\n", + DP_UUID(arg->pool_uuid), DP_UUID(entry->ie_couuid), DP_RC(rc)); + ABT_thread_yield(); + goto again; + } + } else { + rc = dtx_resync(iter_param->ip_hdl, cont, arg->version, true); + } + if (rc == 0) + /* Since dtx_{cleanup,resync} might yield, let's reprobe anyway */ + *acts |= VOS_ITER_CB_YIELD; - /* Since dtx_resync might yield, let's reprobe anyway */ - *acts |= VOS_ITER_CB_YIELD; + ds_cont_child_put(cont); + +out: + DL_CDEBUG(rc != 0, DLOG_ERR, DB_MD, rc, "%s DTX for " DF_UUID "/" DF_UUID, + arg->for_orphan ? "cleanup" : "resync", DP_UUID(arg->pool_uuid), + DP_UUID(entry->ie_couuid)); return rc; } @@ -829,10 +840,29 @@ dtx_resync_ult(void *data) D_ERROR("dtx resync collective "DF_UUID" %d.\n", DP_UUID(arg->pool_uuid), rc); } - pool->sp_dtx_resync_version = arg->version; + + if (pool->sp_dtx_resync_version < arg->version) + pool->sp_dtx_resync_version = arg->version; out: if (pool != NULL) ds_pool_put(pool); D_FREE(arg); } + +int +dtx_cleanup_orphan(uuid_t po_uuid, uint32_t pm_ver) +{ + struct dtx_scan_args arg; + int rc = 0; + + uuid_copy(arg.pool_uuid, po_uuid); + arg.version = pm_ver; + arg.for_orphan = true; + rc = dtx_resync_one(&arg); + + DL_CDEBUG(rc != 0, DLOG_ERR, DB_MD, rc, "DTX cleanup orphan for " DF_UUID " with ver %u", + DP_UUID(po_uuid), pm_ver); + + return rc; +} diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 17d414ba70c..ecd4f8d3407 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -277,6 +277,9 @@ dtx_req_send(struct dtx_req_rec *drr, daos_epoch_t epoch) if (dra->dra_opc == DTX_REFRESH) { if (DAOS_FAIL_CHECK(DAOS_DTX_RESYNC_DELAY)) rc = crt_req_set_timeout(req, 3); + else if (drr->drr_flags != NULL && drr->drr_flags[0] & DRF_FOR_ORPHAN) + /* DRF_FOR_ORPHAN case may need longer timeout. 
*/ + rc = crt_req_set_timeout(req, 60); else /* * If related DTX is committable, then it will be committed @@ -983,7 +986,7 @@ dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) int dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *check_list, - d_list_t *cmt_list, d_list_t *abt_list, d_list_t *act_list, bool for_io) + d_list_t *cmt_list, d_list_t *abt_list, d_list_t *act_list, uint32_t intent) { struct ds_pool *pool = cont->sc_pool->spc_pool; struct pool_target *target; @@ -1001,6 +1004,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che int count; int i; bool drop; + bool for_io = (intent == DRI_IO); D_INIT_LIST_HEAD(&head); D_INIT_LIST_HEAD(&self); @@ -1013,7 +1017,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (dsp->dsp_mbs == NULL) { rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc < 0 && rc != -DER_NONEXIST && for_io) { + if (rc < 0 && rc != -DER_NONEXIST && + (intent == DRI_IO || intent == DRI_ORPHAN)) { D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(&dsp->dsp_xid), DP_RC(rc)); goto out; @@ -1035,7 +1040,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che */ D_WARN("Failed to find DTX leader for "DF_DTI", ver %d: "DF_RC"\n", DP_DTI(&dsp->dsp_xid), pool->sp_map_version, DP_RC(rc)); - if (for_io) + if (intent == DRI_IO || intent == DRI_ORPHAN) goto out; drop = true; @@ -1046,14 +1051,21 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che * * 1. In DTX resync, the status may be resolved sometime later. * 2. The DTX resync is done, but failed to handle related DTX. + * 3. For orphan cleanup, that is almost impossible unless the pool map + * changes again between DTX resync and DTX orphan cleanup. In such + * a case, another DTX resync will be triggered, so the current DTX + * orphan cleanup can just ignore it. */ if (myrank == target->ta_comp.co_rank && dss_get_module_info()->dmi_tgt_id == target->ta_comp.co_index) { d_list_del(&dsp->dsp_link); - if (for_io) + if (for_io) { d_list_add_tail(&dsp->dsp_link, &self); - else + } else { + D_WARN("Hit self leader for DTX " DF_DTI " during cleanup\n", + DP_DTI(&dsp->dsp_xid)); dtx_dsp_free(dsp); + } if (--(*check_count) == 0) break; continue; @@ -1080,6 +1092,9 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che else flags = 0; + if (intent == DRI_ORPHAN) + flags |= DRF_FOR_ORPHAN; + d_list_for_each_entry(drr, &head, drr_link) { if (drr->drr_rank == target->ta_comp.co_rank && drr->drr_tag == target->ta_comp.co_index) { @@ -1363,59 +1378,16 @@ dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont) if (DAOS_FAIL_CHECK(DAOS_DTX_NO_RETRY)) return -DER_IO; - rc = dtx_refresh_internal(cont, &dth->dth_share_tbd_count, - &dth->dth_share_tbd_list, - &dth->dth_share_cmt_list, - &dth->dth_share_abt_list, - &dth->dth_share_act_list, true); - - /* If we can resolve the DTX status, then return -DER_AGAIN - * to the caller that will retry related operation locally. 
- */ + rc = dtx_refresh_internal(cont, &dth->dth_share_tbd_count, &dth->dth_share_tbd_list, + &dth->dth_share_cmt_list, &dth->dth_share_abt_list, + &dth->dth_share_act_list, DRI_IO); if (rc == 0) { D_ASSERT(dth->dth_share_tbd_count == 0); - if (dth->dth_need_validation) { - rc = vos_dtx_validation(dth); - switch (rc) { - case DTX_ST_INITED: - if (!dth->dth_aborted) - break; - /* Fall through */ - case DTX_ST_PREPARED: - case DTX_ST_PREPARING: - /* The DTX has been ever aborted and related resent RPC - * is in processing. Return -DER_AGAIN to make this ULT - * to retry sometime later without dtx_abort(). - */ - rc = -DER_AGAIN; - break; - case DTX_ST_ABORTED: - D_ASSERT(dth->dth_ent == NULL); - /* Aborted, return -DER_INPROGRESS for client retry. - * - * Fall through. - */ - case DTX_ST_ABORTING: - rc = -DER_INPROGRESS; - break; - case DTX_ST_COMMITTED: - case DTX_ST_COMMITTING: - case DTX_ST_COMMITTABLE: - /* Aborted then prepared/committed by race. - * Return -DER_ALREADY to avoid repeated modification. - */ - dth->dth_already = 1; - rc = -DER_ALREADY; - break; - default: - D_ASSERTF(0, "Unexpected DTX "DF_DTI" status %d\n", - DP_DTI(&dth->dth_xid), rc); - } - } else { + rc = vos_dtx_validation(dth); + if (rc == 0) { vos_dtx_cleanup(dth, false); - dtx_handle_reinit(dth); - rc = -DER_AGAIN; + rc = dtx_handle_reinit(dth); } } diff --git a/src/engine/srv.c b/src/engine/srv.c index 1c2ba0d096e..68c325d5f96 100644 --- a/src/engine/srv.c +++ b/src/engine/srv.c @@ -418,7 +418,6 @@ dss_srv_handler(void *arg) dmi->dmi_tgt_id = dx->dx_tgt_id; dmi->dmi_ctx_id = -1; D_INIT_LIST_HEAD(&dmi->dmi_dtx_batched_cont_open_list); - D_INIT_LIST_HEAD(&dmi->dmi_dtx_batched_cont_close_list); D_INIT_LIST_HEAD(&dmi->dmi_dtx_batched_pool_list); (void)pthread_setname_np(pthread_self(), dx->dx_name); diff --git a/src/include/daos/object.h b/src/include/daos/object.h index 106daeec4f5..feb59d24d0c 100644 --- a/src/include/daos/object.h +++ b/src/include/daos/object.h @@ -628,29 +628,31 @@ dc_obj_shard2anchor(daos_anchor_t *anchor, uint32_t shard) enum daos_io_flags { /* The RPC will be sent to leader replica. */ - DIOF_TO_LEADER = 0x1, + DIOF_TO_LEADER = 0x1, /* The RPC will be sent to specified replica. */ - DIOF_TO_SPEC_SHARD = 0x2, + DIOF_TO_SPEC_SHARD = 0x2, /* The operation (enumeration) has specified epoch. */ - DIOF_WITH_SPEC_EPOCH = 0x4, + DIOF_WITH_SPEC_EPOCH = 0x4, /* The operation is for EC recovering. */ - DIOF_EC_RECOV = 0x8, + DIOF_EC_RECOV = 0x8, /* The key existence. */ - DIOF_CHECK_EXISTENCE = 0x10, + DIOF_CHECK_EXISTENCE = 0x10, /* The RPC will be sent to specified redundancy group. */ - DIOF_TO_SPEC_GROUP = 0x20, + DIOF_TO_SPEC_GROUP = 0x20, /* For data migration. */ - DIOF_FOR_MIGRATION = 0x40, + DIOF_FOR_MIGRATION = 0x40, /* For EC aggregation. 
*/ - DIOF_FOR_EC_AGG = 0x80, + DIOF_FOR_EC_AGG = 0x80, /* The operation is for EC snapshot recovering */ - DIOF_EC_RECOV_SNAP = 0x100, + DIOF_EC_RECOV_SNAP = 0x100, /* Only recover from parity */ DIOF_EC_RECOV_FROM_PARITY = 0x200, /* Force fetch/list to do degraded enumeration/fetch */ DIOF_FOR_FORCE_DEGRADE = 0x400, /* reverse enumeration for recx */ DIOF_RECX_REVERSE = 0x800, + /* Use for rebuild fetch epoch selection */ + DIOF_FETCH_EPOCH_EC_AGG_BOUNDARY = 0x1000, }; /** diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index 818f18576fd..60f24d619c9 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -75,24 +75,22 @@ void dss_set_start_epoch(void); bool dss_has_enough_helper(void); struct dss_module_info { - crt_context_t dmi_ctx; - struct bio_xs_context *dmi_nvme_ctxt; - struct dss_xstream *dmi_xstream; + crt_context_t dmi_ctx; + struct bio_xs_context *dmi_nvme_ctxt; + struct dss_xstream *dmi_xstream; /* the xstream id */ - int dmi_xs_id; + int dmi_xs_id; /* the VOS target id */ - int dmi_tgt_id; + int dmi_tgt_id; /* the cart context id */ - int dmi_ctx_id; - uint32_t dmi_dtx_batched_started:1, - dmi_srv_shutting_down:1; - d_list_t dmi_dtx_batched_cont_open_list; - d_list_t dmi_dtx_batched_cont_close_list; - d_list_t dmi_dtx_batched_pool_list; + int dmi_ctx_id; + uint32_t dmi_dtx_batched_started : 1, dmi_srv_shutting_down : 1; + d_list_t dmi_dtx_batched_cont_open_list; + d_list_t dmi_dtx_batched_pool_list; /* the profile information */ - struct daos_profile *dmi_dp; - struct sched_request *dmi_dtx_cmt_req; - struct sched_request *dmi_dtx_agg_req; + struct daos_profile *dmi_dp; + struct sched_request *dmi_dtx_cmt_req; + struct sched_request *dmi_dtx_agg_req; }; extern struct dss_module_key daos_srv_modkey; diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index e5ae00a8880..dc18c53553f 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -436,9 +436,13 @@ dtx_is_real_handle(const struct dtx_handle *dth) struct dtx_scan_args { uuid_t pool_uuid; uint32_t version; + bool for_orphan; }; -int dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, bool block); +/* clang-format off */ +int dtx_cleanup_orphan(uuid_t po_uuid, uint32_t pm_ver); +int dtx_resync(daos_handle_t po_hdl, struct ds_cont_child *cont, uint32_t ver, bool block); void dtx_resync_ult(void *arg); +/* clang-format on */ #endif /* __DAOS_DTX_SRV_H__ */ diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index dfe823af4e2..9cf67485cd6 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -6167,6 +6167,8 @@ dc_obj_fetch_task(tse_task_t *task) if (args->extra_flags & DIOF_EC_RECOV_FROM_PARITY) obj_auxi->flags |= ORF_EC_RECOV_FROM_PARITY; + if (args->extra_flags & DIOF_FETCH_EPOCH_EC_AGG_BOUNDARY) + obj_auxi->flags |= ORF_FETCH_EPOCH_EC_AGG_BOUNDARY; if (args->extra_flags & DIOF_FOR_FORCE_DEGRADE || DAOS_FAIL_CHECK(DAOS_OBJ_FORCE_DEGRADE)) diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 2aa94e15434..9f385b98f52 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1180,6 +1180,16 @@ dc_obj_shard_rw(struct dc_obj_shard *shard, enum obj_rpc_opc opc, orw->orw_api_flags = api_args->flags; orw->orw_epoch = auxi->epoch.oe_value; orw->orw_epoch_first = auxi->epoch.oe_first; + if (orw->orw_flags & ORF_FETCH_EPOCH_EC_AGG_BOUNDARY) { + D_ASSERTF(auxi->epoch.oe_value == auxi->epoch.oe_first, + "bad epoch.oe_value " DF_X64 ", epoch.oe_first 
" DF_X64 "\n", + auxi->epoch.oe_value, auxi->epoch.oe_first); + D_ASSERT(api_args->extra_arg != NULL); + orw->orw_epoch_first = (uintptr_t)api_args->extra_arg; + D_ASSERTF(orw->orw_epoch <= orw->orw_epoch_first, + "bad orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 "\n", orw->orw_epoch, + orw->orw_epoch_first); + } orw->orw_dkey_hash = auxi->obj_auxi->dkey_hash; orw->orw_nr = nr; orw->orw_dkey = *dkey; diff --git a/src/object/obj_rpc.h b/src/object/obj_rpc.h index a9dce1001bd..d4fad534a5f 100644 --- a/src/object/obj_rpc.h +++ b/src/object/obj_rpc.h @@ -191,6 +191,8 @@ enum obj_rpc_flags { ORF_EMPTY_SGL = (1 << 24), /* The CPD RPC only contains read-only transaction. */ ORF_CPD_RDONLY = (1 << 25), + /* Use for rebuild fetch epoch selection */ + ORF_FETCH_EPOCH_EC_AGG_BOUNDARY = (1 << 26), }; /* clang-format on */ diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index c459bb335ed..b7f9b1a3424 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -1787,26 +1787,7 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io * Let's check resent again before further process. */ if (rc == 0 && obj_rpc_is_update(rpc) && sched_cur_seq() != sched_seq) { - if (dth->dth_need_validation) { - daos_epoch_t epoch = 0; - int rc1; - - rc1 = dtx_handle_resend(ioc->ioc_vos_coh, &orw->orw_dti, &epoch, NULL); - switch (rc1) { - case 0: - orw->orw_epoch = epoch; - /* Fall through */ - case -DER_ALREADY: - rc = -DER_ALREADY; - break; - case -DER_NONEXIST: - case -DER_EP_OLD: - break; - default: - rc = rc1; - break; - } - } + rc = vos_dtx_validation(dth); /* For solo update, it will be handled via one-phase transaction. * If there is CPU yield after its epoch generated, we will renew @@ -1832,7 +1813,7 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io } /* re-generate the recx_list if some akeys skipped */ - if (skips != NULL && orwo->orw_rels.ca_arrays != NULL && orw->orw_nr != iods_nr) + if (rc == 0 && skips != NULL && orwo->orw_rels.ca_arrays != NULL && orw->orw_nr != iods_nr) rc = obj_rw_recx_list_post(orw, orwo, skips, rc); rc = obj_rw_complete(rpc, ioc, ioh, rc, dth); @@ -1847,7 +1828,7 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io } if (iods_dup != NULL) daos_iod_recx_free(iods_dup, iods_nr); - return unlikely(rc == -DER_ALREADY) ? 0 : rc; + return rc; } /* Extract local iods/offs/csums by orw_oid.id_shard from @orw */ @@ -2085,7 +2066,7 @@ obj_local_rw(crt_rpc_t *rpc, struct obj_io_context *ioc, struct dtx_handle *dth) if (dth != NULL && obj_dtx_need_refresh(dth, rc)) { if (++retry < 3) { rc = dtx_refresh(dth, ioc->ioc_coc); - if (rc == -DER_AGAIN) + if (rc == 0) goto again; } else if (orw->orw_flags & ORF_MAYBE_STARVE) { dsp = d_list_entry(dth->dth_share_tbd_list.next, struct dtx_share_peer, @@ -2798,7 +2779,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) rc = obj_local_rw(rpc, &ioc, dth); if (rc != 0) DL_CDEBUG( - rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || + rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || rc == -DER_ALREADY || (rc == -DER_EXIST && (orw->orw_api_flags & (DAOS_COND_DKEY_INSERT | DAOS_COND_AKEY_INSERT))) || (rc == -DER_NONEXIST && @@ -2966,6 +2947,36 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (obj_rpc_is_fetch(rpc)) { struct dtx_handle *dth; + /* ORF_FETCH_EPOCH_EC_AGG_BOUNDARY only used for rebuild fetch. 
The container's + * sc_ec_agg_eph_boundary may differ between the initiator and target engines of the + * rebuild fetch, so the fetch epoch selected by the initiator may be lower than the + * readable epoch on the target engine side if VOS aggregation merged adjacent extents + * to a higher epoch. In that case, increase the fetch epoch to sc_ec_agg_eph_boundary. + */ + if (orw->orw_flags & ORF_FETCH_EPOCH_EC_AGG_BOUNDARY) { + uint64_t rebuild_epoch; + + D_ASSERTF(orw->orw_epoch <= orw->orw_epoch_first, + "bad orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 "\n", + orw->orw_epoch, orw->orw_epoch_first); + rebuild_epoch = orw->orw_epoch_first; + if (orw->orw_epoch < ioc.ioc_coc->sc_ec_agg_eph_boundary) { + uint64_t fetch_epoch; + + /* Neither EC nor VOS aggregation crosses the rebuild epoch */ + fetch_epoch = + min(ioc.ioc_coc->sc_ec_agg_eph_boundary, rebuild_epoch); + D_DEBUG(DB_IO, + DF_UOID " increase fetch epoch from " DF_X64 " to " DF_X64 + ", sc_ec_agg_eph_boundary: " DF_X64 + ", rebuild_epoch: " DF_X64 "\n", + DP_UOID(orw->orw_oid), orw->orw_epoch, fetch_epoch, + ioc.ioc_coc->sc_ec_agg_eph_boundary, rebuild_epoch); + orw->orw_epoch = fetch_epoch; + } + orw->orw_epoch_first = orw->orw_epoch; + } + if (orw->orw_flags & ORF_CSUM_REPORT) { obj_log_csum_err(); D_GOTO(out, rc = 0); @@ -3311,7 +3322,7 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, if (obj_dtx_need_refresh(dth, rc)) { rc = dtx_refresh(dth, ioc->ioc_coc); /* After DTX refresh, re_pack will resume from the position at \@anchors. */ - if (rc == -DER_AGAIN) + if (rc == 0) goto re_pack; } @@ -3619,44 +3630,15 @@ obj_local_punch(struct obj_punch_in *opi, crt_opcode_t opc, uint32_t shard_nr, u } rc = dtx_refresh(dth, ioc->ioc_coc); - if (rc != -DER_AGAIN) + if (rc != 0) goto out; - if (unlikely(sched_cur_seq() == sched_seq)) - goto again; - - /* - * There is CPU yield after DTX start, and the resent RPC may be handled - * during that. Let's check resent again before further process. - */ - - if (dth->dth_need_validation) { - daos_epoch_t epoch = 0; - int rc1; - - rc1 = dtx_handle_resend(ioc->ioc_vos_coh, &opi->opi_dti, &epoch, NULL); - switch (rc1) { - case 0: - opi->opi_epoch = epoch; - /* Fall through */ - case -DER_ALREADY: - rc = -DER_ALREADY; - break; - case -DER_NONEXIST: - case -DER_EP_OLD: - break; - default: - rc = rc1; - break; - } - } - /* * For solo punch, it will be handled via one-phase transaction. If there is CPU * yield after its epoch generated, we will renew the epoch, then we can use the * epoch to sort related solo DTXs based on their epochs. 
*/ - if (rc == -DER_AGAIN && dth->dth_solo) { + if (dth->dth_solo && sched_cur_seq() != sched_seq) { struct dtx_epoch epoch; epoch.oe_value = d_hlc_get(); @@ -3748,8 +3730,8 @@ obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count) exec: rc = obj_local_punch(opi, otpa->opc, count, shards, p_ioc, dth); if (rc != 0) - DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || - (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), + DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || rc == -DER_ALREADY || + (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); out: @@ -4166,7 +4148,7 @@ obj_local_query(struct obj_tgt_query_args *otqa, struct obj_io_context *ioc, dao p_recx, p_epoch, cell_size, stripe_size, dth); if (obj_dtx_need_refresh(dth, rc)) { rc = dtx_refresh(dth, ioc->ioc_coc); - if (rc == -DER_AGAIN) + if (rc == 0) goto again; } @@ -4779,24 +4761,11 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp * Let's check resent again before further process. */ if (rc == 0 && dth->dth_modification_cnt > 0 && sched_cur_seq() != sched_seq) { - if (dth->dth_need_validation) { - daos_epoch_t epoch = 0; - int rc1; - - rc1 = dtx_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, NULL); - switch (rc1) { - case 0: - case -DER_ALREADY: - D_GOTO(out, rc = -DER_ALREADY); - case -DER_NONEXIST: - case -DER_EP_OLD: - break; - default: - D_GOTO(out, rc = rc1); - } - } + rc = vos_dtx_validation(dth); + if (rc != 0) + goto out; - if (rc == 0 && dth->dth_solo) { + if (dth->dth_solo) { daos_epoch_t epoch = dcsh->dcsh_epoch.oe_value; D_ASSERT(dcde->dcde_read_cnt == 0); @@ -4962,7 +4931,7 @@ ds_cpd_handle_one_wrap(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, if (obj_dtx_need_refresh(dth, rc)) { if (++retry < 3) { rc = dtx_refresh(dth, ioc->ioc_coc); - if (rc == -DER_AGAIN) + if (rc == 0) goto again; } else if (oci->oci_flags & ORF_MAYBE_STARVE) { dsp = d_list_entry(dth->dth_share_tbd_list.next, diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 387d6ebc73f..0e97778ac88 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -487,8 +487,12 @@ migrate_pool_tls_create_one(void *data) pool_child = ds_pool_child_lookup(arg->pool_uuid); if (pool_child == NULL) { - D_ASSERTF(dss_get_module_info()->dmi_xs_id == 0, - "Cannot find the pool "DF_UUIDF"\n", DP_UUID(arg->pool_uuid)); + /* Local ds_pool_child isn't started yet, return a retry-able error */ + if (dss_get_module_info()->dmi_xs_id != 0) { + D_INFO(DF_UUID ": Local VOS pool isn't ready yet.\n", + DP_UUID(arg->pool_uuid)); + return -DER_STALE; + } } else if (unlikely(pool_child->spc_no_storage)) { D_DEBUG(DB_REBUILD, DF_UUID" "DF_UUID" lost pool shard, ver %d, skip.\n", DP_UUID(arg->pool_uuid), DP_UUID(arg->pool_hdl_uuid), arg->version); @@ -747,11 +751,19 @@ mrone_obj_fetch_internal(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_ daos_iod_t *iods, int iod_num, daos_epoch_t eph, uint32_t flags, d_iov_t *csum_iov_fetch, struct migrate_pool_tls *tls) { - int rc; + uint32_t *extra_arg = NULL; + int rc; + + /* pass rebuild epoch by extra_arg */ + if (flags & DIOF_FETCH_EPOCH_EC_AGG_BOUNDARY) { + D_ASSERTF(eph <= mrone->mo_epoch, "bad eph " DF_X64 ", mo_epoch " DF_X64 "\n", eph, + mrone->mo_epoch); + extra_arg = (uint32_t *)mrone->mo_epoch; + } retry: - rc = dsc_obj_fetch(oh, eph, &mrone->mo_dkey, iod_num, iods, sgls, - NULL, flags, NULL, csum_iov_fetch); + rc = 
dsc_obj_fetch(oh, eph, &mrone->mo_dkey, iod_num, iods, sgls, NULL, flags, extra_arg, + csum_iov_fetch); if (rc == -DER_TIMEDOUT && tls->mpt_version + 1 >= tls->mpt_pool->spc_map_version) { if (tls->mpt_fini) { @@ -1082,10 +1094,9 @@ migrate_update_parity(struct migrate_one *mrone, daos_epoch_t parity_eph, } static int -__migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, - daos_iod_t *iods, daos_epoch_t fetch_eph, - daos_epoch_t **ephs, uint32_t iods_num, - struct ds_cont_child *ds_cont, bool encode) +__migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, daos_iod_t *iods, + daos_epoch_t fetch_eph, daos_epoch_t **ephs, uint32_t iods_num, + struct ds_cont_child *ds_cont, uint32_t flags, bool encode) { d_sg_list_t sgls[OBJ_ENUM_UNPACK_MAX_IODS]; d_iov_t iov[OBJ_ENUM_UNPACK_MAX_IODS] = { 0 }; @@ -1115,8 +1126,7 @@ __migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, D_DEBUG(DB_REBUILD, DF_UOID" mrone %p dkey "DF_KEY" nr %d eph "DF_U64"\n", DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), iods_num, mrone->mo_epoch); - rc = mrone_obj_fetch(mrone, oh, sgls, iods, iods_num, fetch_eph, DIOF_FOR_MIGRATION, - NULL); + rc = mrone_obj_fetch(mrone, oh, sgls, iods, iods_num, fetch_eph, flags, NULL); if (rc) { D_ERROR("migrate dkey "DF_KEY" failed: "DF_RC"\n", DP_KEY(&mrone->mo_dkey), DP_RC(rc)); @@ -1220,8 +1230,9 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, update_eph = mrone->mo_iods_update_ephs_from_parity[i][j]; update_eph_p = &update_eph; - rc = __migrate_fetch_update_parity(mrone, oh, &iod, fetch_eph, - &update_eph_p, 1, ds_cont, true); + rc = __migrate_fetch_update_parity( + mrone, oh, &iod, fetch_eph, &update_eph_p, 1, ds_cont, + DIOF_FOR_MIGRATION | DIOF_FETCH_EPOCH_EC_AGG_BOUNDARY, true); if (rc) return rc; } @@ -1230,8 +1241,8 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, /* Otherwise, keep it as replicate recx */ if (mrone->mo_iod_num > 0) { rc = __migrate_fetch_update_parity(mrone, oh, mrone->mo_iods, mrone->mo_epoch, - mrone->mo_iods_update_ephs, - mrone->mo_iod_num, ds_cont, false); + mrone->mo_iods_update_ephs, mrone->mo_iod_num, + ds_cont, DIOF_FOR_MIGRATION, false); } return rc; @@ -1565,13 +1576,15 @@ migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, */ if (ds_cont->sc_ec_agg_eph_boundary > mrone->mo_iods_update_ephs_from_parity[i][j]) - fetch_eph = mrone->mo_epoch; + fetch_eph = min(ds_cont->sc_ec_agg_eph_boundary, mrone->mo_epoch); else fetch_eph = mrone->mo_iods_update_ephs_from_parity[i][j]; - rc = __migrate_fetch_update_bulk(mrone, oh, &iod, 1, fetch_eph, - mrone->mo_iods_update_ephs_from_parity[i][j], - DIOF_EC_RECOV_FROM_PARITY | DIOF_FOR_MIGRATION, - ds_cont); + rc = __migrate_fetch_update_bulk( + mrone, oh, &iod, 1, fetch_eph, + mrone->mo_iods_update_ephs_from_parity[i][j], + DIOF_EC_RECOV_FROM_PARITY | DIOF_FOR_MIGRATION | + DIOF_FETCH_EPOCH_EC_AGG_BOUNDARY, + ds_cont); if (rc != 0) D_GOTO(out, rc); } diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 68c57afc669..41a1dac2fe3 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -977,6 +977,11 @@ rebuild_scanner(void *data) if (tls == NULL) return 0; + /* There may be orphan DTX entries left after DTX resync; clean them up before the rebuild scan. 
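If the cleanup fails, fail the rebuild scan and return the error. 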
*/ + rc = dtx_cleanup_orphan(rpt->rt_pool_uuid, rpt->rt_pool->sp_dtx_resync_version); + if (rc != 0) + D_GOTO(out, rc); + if (!is_rebuild_scanning_tgt(rpt)) { D_DEBUG(DB_REBUILD, DF_UUID " skip scan\n", DP_UUID(rpt->rt_pool_uuid)); D_GOTO(out, rc = 0); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 1af3353d8af..76971149ef8 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2253,7 +2253,7 @@ rebuild_fini_one(void *arg) D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); dpc = ds_pool_child_lookup(rpt->rt_pool_uuid); - /* The pool child could be stopped */ + /* The ds_pool_child is already stopped */ if (dpc == NULL) return 0; @@ -2497,9 +2497,11 @@ rebuild_prepare_one(void *data) int rc = 0; dpc = ds_pool_child_lookup(rpt->rt_pool_uuid); - /* The pool child could be stopped */ - if (dpc == NULL) - return 0; + /* Local ds_pool_child isn't started yet, return a retry-able error */ + if (dpc == NULL) { + D_INFO(DF_UUID ": Local VOS pool isn't ready yet.\n", DP_UUID(rpt->rt_pool_uuid)); + return -DER_STALE; + } if (unlikely(dpc->spc_no_storage)) D_GOTO(put, rc = 0); @@ -2666,7 +2668,9 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) D_GOTO(out, rc = -DER_NOMEM); rpt->rt_rebuild_fence = d_hlc_get(); - rc = dss_task_collective(rebuild_prepare_one, rpt, 0); + rc = ds_pool_task_collective(rpt->rt_pool_uuid, + PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, + rebuild_prepare_one, rpt, 0); if (rc) { rpt->rt_rebuild_fence = 0; rebuild_pool_tls_destroy(pool_tls); diff --git a/src/tests/ftest/nvme/enospace.py b/src/tests/ftest/nvme/enospace.py index f0b32edf7bc..4ce7743ef56 100644 --- a/src/tests/ftest/nvme/enospace.py +++ b/src/tests/ftest/nvme/enospace.py @@ -34,7 +34,42 @@ def __init__(self, *args, **kwargs): """Initialize a NvmeEnospace object.""" super().__init__(*args, **kwargs) - self.metric_names = ['engine_pool_vos_space_scm_used', 'engine_pool_vos_space_nvme_used'] + self.space_metric_names = [ + 'engine_pool_vos_space_scm_used', + 'engine_pool_vos_space_nvme_used' + ] + self.aggr_metric_names = [ + # -- Merged records -- + "engine_pool_vos_aggregation_merged_size", + "engine_pool_vos_aggregation_merged_recs", + # -- Deleted records -- + "engine_pool_vos_aggregation_deleted_ev", + "engine_pool_vos_aggregation_deleted_sv", + # -- Errors -- + "engine_pool_vos_aggregation_fail_count", + "engine_pool_vos_aggregation_csum_errors", + "engine_pool_vos_aggregation_uncommitted", + "engine_pool_vos_aggregation_agg_blocked", + "engine_pool_vos_aggregation_discard_blocked", + # -- Details stat counter -- + "engine_pool_vos_aggregation_obj_deleted", + "engine_pool_vos_aggregation_obj_scanned", + "engine_pool_vos_aggregation_obj_skipped", + "engine_pool_vos_aggregation_akey_deleted", + "engine_pool_vos_aggregation_akey_scanned", + "engine_pool_vos_aggregation_akey_skipped", + "engine_pool_vos_aggregation_dkey_deleted", + "engine_pool_vos_aggregation_dkey_scanned", + "engine_pool_vos_aggregation_dkey_skipped", + # -- Duration -- + "engine_pool_vos_aggregation_epr_duration", + "engine_pool_vos_aggregation_epr_duration_max", + "engine_pool_vos_aggregation_epr_duration_mean", + "engine_pool_vos_aggregation_epr_duration_min", + "engine_pool_vos_aggregation_epr_duration_stddev" + ] + self.metric_names = self.space_metric_names + self.aggr_metric_names + self.media_names = ['SCM', 'NVMe'] self.expected_errors = [self.DER_NOSPACE, self.DER_TIMEDOUT] @@ -56,26 +91,31 @@ def setUp(self): self.daos_cmd = DaosCommand(self.bin) self.create_pool_max_size() - def 
get_pool_space_metrics(self, pool_uuid): + def get_pool_space_metrics(self, pool, metrics): """Return the metrics on space usage of a given pool. Args: - pool_uuid (str): Unique id of a pool. + pool (TestPool): target TestPool. + metrics (dict): telemetry metrics. Returns: dict: metrics on space usage. """ - metrics = {} - for hostname, data in self.telemetry.get_metrics(",".join(self.metric_names)).items(): + pool_uuid = pool.uuid + space_metrics = {} + for hostname, data in metrics.items(): for metric_name, entry in data.items(): - if metric_name not in metrics: - metrics[metric_name] = { + if metric_name not in self.space_metric_names: + continue + + if metric_name not in space_metrics: + space_metrics[metric_name] = { "description": entry['description'], "hosts": {} } - hosts = metrics[metric_name]["hosts"] + hosts = space_metrics[metric_name]["hosts"] for metric in entry['metrics']: if metric['labels']['pool'].casefold() != pool_uuid.casefold(): continue @@ -90,11 +130,60 @@ def get_pool_space_metrics(self, pool_uuid): target = metric['labels']['target'] hosts[hostname][rank][target] = metric['value'] - return metrics + return space_metrics + + def get_pool_aggr_metrics(self, pool, metrics): + """Return the metrics on aggregation counters and gauges. + + Args: + pool (TestPool): target TestPool. + metrics (dict): telemetry metrics. + + Returns: + dict: metrics on aggregation. + + """ + pool_uuid = pool.uuid + aggr_metrics = { + "metric_descriptions": {}, + "metric_values": {} + } + for hostname, data in metrics.items(): + if hostname not in aggr_metrics["metric_values"]: + aggr_metrics["metric_values"][hostname] = {} + hosts = aggr_metrics["metric_values"][hostname] + + for metric_name, entry in data.items(): + if metric_name not in self.aggr_metric_names: + continue + + if metric_name not in aggr_metrics["metric_descriptions"]: + aggr_metrics["metric_descriptions"][metric_name] = entry["description"] + + for metric in entry['metrics']: + if metric['labels']['pool'].casefold() != pool_uuid.casefold(): + continue + + rank = metric['labels']['rank'] + if rank not in hosts: + hosts[rank] = {} + ranks = hosts[rank] + + target = metric['labels']['target'] + if target not in ranks: + ranks[target] = {} + targets = ranks[target] + + targets[metric_name] = metric['value'] + + return aggr_metrics def get_pool_usage(self, pool_space): """Get the pool storage used % for SCM and NVMe. + Args: + pool_space (object): space usage information of a pool. + Returns: list: a list of SCM/NVMe pool space usage in %(float) @@ -107,14 +196,55 @@ def get_pool_usage(self, pool_space): return pool_usage - def display_pool_stats(self, pool_space, pool_space_metrics): - """Display statistics on pool usage. + def display_table(self, title, table, align_idx): + """Pretty print table content. + + Args: + title (str): Title of the table. + table (list): Table to print on stdout. + align_idx (int): Last column to left align. 
+ """ + cols_size = [ + max(i) for i in [[len(row[j]) for row in table] for j in range(len(table[0]))]] + line_size = sum(cols_size) + 3 * (len(cols_size) - 1) + + self.log.debug("") + line = f"{' ' + title + ' ':-^{line_size}}" + self.log.debug(line) + + line = "" + for idx, elt in enumerate(table[0]): + line += f"{elt:^{cols_size[idx]}}" + if idx + 1 != len(table[0]): + line += " | " + self.log.debug(line) + + line = "" + for idx, size in enumerate(cols_size): + line += '-' * size + if idx + 1 != len(cols_size): + line += "-+-" + self.log.debug(line) + + for row in table[1:]: + line = "" + for idx, elt in enumerate(row): + align_op = "<" + if idx > align_idx: + align_op = ">" + line += f"{elt:{align_op}{cols_size[idx]}}" + if idx + 1 != len(row): + line += " | " + self.log.debug(line) + + def display_pool_space(self, pool_space, pool_space_metrics): + """Display space usage statistics of a given pool. Args: pool_space (object): space usage information of a pool. pool_space_metrics (dict): dict of metrics on space usage of a pool. """ - + self.log.debug("") title = f"{' Pool Space Usage ':-^80}" self.log.debug(title) @@ -136,34 +266,65 @@ def display_pool_stats(self, pool_space, pool_space_metrics): for metric in pool_space_metrics.values(): table = [["Hostname", "Rank", "Target", "Size"]] - cols_size = [] - for cell in table[0]: - cols_size.append(len(cell)) for hostname, ranks in metric['hosts'].items(): for rank, targets in ranks.items(): for target, size in targets.items(): row = [hostname, rank, target, get_display_size(size)] table.append(row) - for idx, elt in enumerate(cols_size): - cols_size[idx] = max(elt, len(row[idx])) hostname = "" rank = "" - for idx, elt in enumerate(table[0]): - table[0][idx] = f"{elt:^{cols_size[idx]}}" - row = ' | '.join(table[0]) - title = f"{' ' + metric['description'] + ' ':-^{len(row)}}" - self.log.debug("") - self.log.debug(title) - self.log.debug(row) - self.log.debug("-" * len(row)) - for row in table[1:]: - for idx, elt in enumerate(row): - align_op = "<" - if idx + 1 == len(row): - align_op = ">" - row[idx] = f"{elt:{align_op}{cols_size[idx]}}" - self.log.debug(" | ".join(row)) + self.display_table(metric['description'], table, 2) + + def display_pool_aggregation(self, metrics): + """Display record aggregation statistics of a given pool. + + Args: + metrics (dict): dict of metrics on pool aggregation. 
+ """ + table = [["Hostname", "Rank", "Target"]] + for it in self.aggr_metric_names: + table[0].append(metrics["metric_descriptions"][it]) + + for hostname in sorted(metrics["metric_values"]): + row = [hostname] + + for rank in sorted(metrics["metric_values"][hostname]): + if not row: + row = [""] + row.append(rank) + + for target in sorted(metrics["metric_values"][hostname][rank]): + if not row: + row = ["", ""] + row.append(target) + + idx = 3 + for metric_name in self.aggr_metric_names: + value = metrics["metric_values"][hostname][rank][target][metric_name] + if metric_name == "engine_pool_vos_aggregation_merged_size": + row.append(get_display_size(value)) + else: + row.append(str(value)) + idx += 1 + + table.append(row) + row = None + + self.display_table('Pool Aggregation stats', table, 2) + + def display_stats(self): + """Display usage statistics of the tested pool.""" + self.pool.get_info() + metrics = self.telemetry.get_metrics(",".join(self.metric_names)) + + pool_space = self.pool.info.pi_space + pool_space_metrics = self.get_pool_space_metrics(self.pool, metrics) + self.display_pool_space(pool_space, pool_space_metrics) + + pool_aggr_metrics = self.get_pool_aggr_metrics(self.pool, metrics) + self.display_pool_aggregation(pool_aggr_metrics) + self.log.debug("") def verify_enospace_log(self, log_file): """Function checking logs consistency. @@ -204,10 +365,14 @@ def verify_enospace_log(self, log_file): "Number of errors %s (%s) is > 0: got=%d", c_err_to_str(error), error, errors_count[error]) - def delete_all_containers(self): - """Delete all the containers.""" + def delete_all_containers(self, pool): + """Delete all the containers of a given pool. + + Args: + pool (TestPool): target TestPool. + """ # List all the container - kwargs = {"pool": self.pool.uuid} + kwargs = {"pool": pool.uuid} data = self.daos_cmd.container_list(**kwargs) containers = [uuid_label["uuid"] for uuid_label in data["response"]] @@ -288,17 +453,22 @@ def run_enospace_foreground(self, log_file): log_file (str): name prefix of the log files to check. """ self.log.info('----Starting main IOR load----') + self.display_stats() # Fill 75% of current SCM free space. Aggregation is Enabled so NVMe space will # start to fill up. self.log.info('--Filling 75% of the current SCM free space--') - self.start_ior_load(storage='SCM', operation="Auto_Write", percent=75) - self.log.info(self.pool.pool_percentage_used()) + try: + self.start_ior_load(storage='SCM', operation="Auto_Write", percent=75) + finally: + self.display_stats() # Fill 50% of current SCM free space. Aggregation is Enabled so NVMe space will # continue to fill up. - self.start_ior_load(storage='SCM', operation="Auto_Write", percent=50) - self.log.info(self.pool.pool_percentage_used()) + try: + self.start_ior_load(storage='SCM', operation="Auto_Write", percent=50) + finally: + self.display_stats() # Fill 60% of current SCM free space. This time, NVMe will be Full so data will # not be moved to NVMe and continue to fill up SCM. 
SCM will be full and this @@ -310,20 +480,15 @@ except TestFail: self.log.info('Test is expected to fail because of DER_NOSPACE') else: - self.fail('This test is suppose to FAIL because of DER_NOSPACE' - 'but it Passed') + self.fail('This test is supposed to FAIL because of DER_NOSPACE but it Passed') - - # Display the pool statistics - self.pool.get_info() - pool_space = self.pool.info.pi_space - pool_space_metrics = self.get_pool_space_metrics(self.pool.uuid) - self.display_pool_stats(pool_space, pool_space_metrics) + finally: + self.display_stats() # verify the DER_NO_SPACE error count is expected and no other Error in client log self.verify_enospace_log(log_file) # Check both NVMe and SCM are full. - pool_usage = self.get_pool_usage(pool_space) + pool_usage = self.get_pool_usage(self.pool.info.pi_space) for idx, elt in enumerate(self.media_names): if pool_usage[idx] >= self.pool_usage_min[idx]: continue @@ -417,7 +582,7 @@ def test_enospace_lazy_with_fg(self): log_file = f"-loop_{_loop}".join(os.path.splitext(self.client_log)) self.run_enospace_foreground(log_file) self.log_step(f"Delete all containers - enospc_lazy_fg loop {_loop}") - self.delete_all_containers() + self.delete_all_containers(self.pool) self.log_step(f"Wait for aggregation to complete - enospc_lazy_fg loop {_loop}") if not self.pool.check_free_space( expected_scm=f">={int(initial_free_scm * scm_threshold_percent / 100)}", @@ -491,7 +656,7 @@ def test_enospace_time_with_fg(self): log_file = f"-loop_{_loop}".join(os.path.splitext(self.client_log)) self.run_enospace_with_bg_job(log_file) self.log_step(f"Delete all containers - enospace_time_with_fg loop {_loop}") - self.delete_all_containers() + self.delete_all_containers(self.pool) self.log_step(f"Wait for aggregation to complete - enospace_time_with_fg loop {_loop}") if not self.pool.check_free_space( expected_scm=f">={int(initial_free_scm * scm_threshold_percent / 100)}", @@ -591,7 +756,7 @@ def test_enospace_no_aggregation(self): self.verify_enospace_log(log_file) # Delete all the containers - self.delete_all_containers() + self.delete_all_containers(self.pool) # Wait for the SCM space to be released. (Usage goes below 60%) scm_released = False diff --git a/src/vos/evt_iter.c b/src/vos/evt_iter.c index ef9e3efa296..fe5ad6b4f48 100644 --- a/src/vos/evt_iter.c +++ b/src/vos/evt_iter.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2022 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -407,7 +408,6 @@ int evt_iter_probe(daos_handle_t ih, enum evt_iter_opc opc, const struct evt_rect *rect, const daos_anchor_t *anchor) { - struct vos_iterator *oiter = vos_hdl2iter(ih); struct evt_iterator *iter; struct evt_context *tcx; struct evt_entry_array *enta; @@ -457,8 +457,7 @@ evt_iter_probe(daos_handle_t ih, enum evt_iter_opc opc, rtmp = *rect; } - rc = evt_ent_array_fill(tcx, fopc, vos_iter_intent(oiter), - &iter->it_filter, &rtmp, enta); + rc = evt_ent_array_fill(tcx, fopc, evt_iter_intent(iter), &iter->it_filter, &rtmp, enta); if (rc != 0) D_GOTO(out, rc); @@ -473,7 +472,7 @@ evt_iter_probe(daos_handle_t ih, enum evt_iter_opc opc, iter->it_state = EVT_ITER_READY; iter->it_skip_move = 0; } - out: +out: return rc; } diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index bce30b201a6..b9101cd07e4 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -369,42 +369,13 @@ vos_tx_end(struct vos_container *cont, struct dtx_handle *dth_in, dae->dae_preparing = 0; } - if (err == 0 && unlikely(dth->dth_need_validation && dth->dth_active)) { - /* Aborted by race during the yield for local TX commit. */ - rc = vos_dtx_validation(dth); - switch (rc) { - case DTX_ST_INITED: - case DTX_ST_PREPARED: - case DTX_ST_PREPARING: - /* The DTX has been ever aborted and related resent RPC - * is in processing. Return -DER_AGAIN to make this ULT - * to retry sometime later without dtx_abort(). - */ - err = -DER_AGAIN; - break; - case DTX_ST_ABORTED: - D_ASSERT(dae == NULL); - /* Aborted, return -DER_INPROGRESS for client retry. - * - * Fall through. - */ - case DTX_ST_ABORTING: - err = -DER_INPROGRESS; - break; - case DTX_ST_COMMITTED: - case DTX_ST_COMMITTING: - case DTX_ST_COMMITTABLE: - /* Aborted then prepared/committed by race. - * Return -DER_ALREADY to avoid repeated modification. - */ - dth->dth_already = 1; - err = -DER_ALREADY; - break; - default: - D_ASSERTF(0, "Unexpected DTX "DF_DTI" status %d\n", - DP_DTI(&dth->dth_xid), rc); - } - } else if (dae != NULL) { + if (err == 0 && dth->dth_active) { + err = vos_dtx_validation(dth); + if (err != 0) + goto out; + } + + if (dae != NULL) { if (dth->dth_solo) { if (err == 0 && dae->dae_committing && cont->vc_solo_dtx_epoch < dth->dth_epoch) @@ -429,6 +400,7 @@ vos_tx_end(struct vos_container *cont, struct dtx_handle *dth_in, } } +out: if (err != 0) { /* Do not set dth->dth_pinned. Upper layer caller can do that via * vos_dtx_cleanup() when necessary. diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 666b842c89f..07daa2b2104 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -442,9 +442,16 @@ dtx_cmt_ent_update(struct btr_instance *tins, struct btr_record *rec, rec->rec_off = umem_ptr2off(&tins->ti_umm, dce_new); D_FREE(dce_old); } else if (!dce_old->dce_reindex) { - D_ASSERTF(dce_new->dce_reindex, "Repeatedly commit DTX "DF_DTI"\n", - DP_DTI(&DCE_XID(dce_new))); - dce_new->dce_exist = 1; + /* If two client threads (such as a non-initialized context after fork) happen to use + * the same DTX ID, then it is possible to arrive here. But once we get here, we + * have no way to ask the related client/application to restart the transaction, + * since the related RPC may have already completed. 
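+ * So just log a warning instead of asserting when the same DTX is committed again. 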
+ */ + if (unlikely(dce_new->dce_reindex == 0)) + D_WARN("Commit DTX " DF_DTI " more than once, maybe reused\n", + DP_DTI(&DCE_XID(dce_new))); + else + dce_new->dce_exist = 1; } return 0; @@ -1484,7 +1491,8 @@ vos_dtx_validation(struct dtx_handle *dth) d_iov_t riov; int rc = 0; - D_ASSERT(dtx_is_valid_handle(dth)); + if (!dtx_is_valid_handle(dth) || dth->dth_need_validation == 0) + return 0; dae = dth->dth_ent; @@ -1500,7 +1508,7 @@ vos_dtx_validation(struct dtx_handle *dth) * (or different) DTX LRU array slot. */ - if (unlikely(dth->dth_aborted)) { + if (dth->dth_aborted) { D_ASSERT(dae == NULL); cont = vos_hdl2cont(dth->dth_coh); D_ASSERT(cont != NULL); @@ -1531,7 +1539,22 @@ vos_dtx_validation(struct dtx_handle *dth) out: dth->dth_need_validation = 0; - return rc; + + /* The DTX was aborted, then resent, prepared and committed by race. Return -DER_ALREADY + * to avoid repeated modification. + */ + if (rc == DTX_ST_COMMITTED || rc == DTX_ST_COMMITTING || rc == DTX_ST_COMMITTABLE) { + dth->dth_already = 1; + return -DER_ALREADY; + } + + /* The DTX was aborted at some point. Return -DER_AGAIN to make the related client retry + * sometime later without triggering dtx_abort(). + */ + if (dth->dth_aborted || rc == DTX_ST_ABORTED || rc == DTX_ST_ABORTING) + return -DER_AGAIN; + + return rc > 0 ? 0 : rc; } /* The caller has started local transaction. */ @@ -1552,40 +1575,10 @@ vos_dtx_register_record(struct umem_instance *umm, umem_off_t record, * Check whether someone touched the DTX before we registering modification * for the first time (during the prepare, such as bulk data transferring). */ - if (unlikely(dth->dth_need_validation && !dth->dth_active)) { + if (!dth->dth_active) { rc = vos_dtx_validation(dth); - switch (rc) { - case DTX_ST_INITED: - if (!dth->dth_aborted) - break; - /* Fall through */ - case DTX_ST_PREPARED: - case DTX_ST_PREPARING: - /* The DTX has been ever aborted and related resent RPC - * is in processing. Return -DER_AGAIN to make this ULT - * to retry sometime later without dtx_abort(). - */ - D_GOTO(out, rc = -DER_AGAIN); - case DTX_ST_COMMITTED: - case DTX_ST_COMMITTING: - case DTX_ST_COMMITTABLE: - /* Aborted then prepared/committed by race. - * Return -DER_ALREADY to avoid repeated modification. - */ - dth->dth_already = 1; - D_GOTO(out, rc = -DER_ALREADY); - case DTX_ST_ABORTED: - D_ASSERT(dth->dth_ent == NULL); - /* Aborted, return -DER_INPROGRESS for client retry. - * - * Fall through. 
- */ - case DTX_ST_ABORTING: - D_GOTO(out, rc = -DER_INPROGRESS); - default: - D_ASSERTF(0, "Unexpected DTX "DF_DTI" status %d\n", - DP_DTI(&dth->dth_xid), rc); - } + if (rc != 0) + goto out; } dae = dth->dth_ent; diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index a0d8ab427f7..3c19a8ae131 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -23,7 +23,7 @@ Name: daos Version: 2.6.4 -Release: 1%{?relval}%{?dist} +Release: 2%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -628,6 +628,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Tue Aug 05 2025 Dalton Bohning 2.6.4-2 +- Second release candidate for 2.6.4 + * Tue Jun 17 2025 Phillip Henderson 2.6.4-1 - First release candidate for 2.6.4 diff --git a/utils/rpms/packaging/Dockerfile.mockbuild b/utils/rpms/packaging/Dockerfile.mockbuild index 9b3b48fbbf5..96621daabc3 100644 --- a/utils/rpms/packaging/Dockerfile.mockbuild +++ b/utils/rpms/packaging/Dockerfile.mockbuild @@ -30,7 +30,7 @@ RUN chmod +x /tmp/repo-helper.sh && \ # This needs to be moved to a shell script like above in the future to # properly only remove the proxy variables only when they need to be removed RUN dnf -y install mock make \ - rpm-build createrepo rpmlint redhat-lsb-core git \ + rpm-build createrepo rpmlint git \ python-srpm-macros rpmdevtools && \ dnf -y clean all diff --git a/utils/scripts/helpers/repo-helper-fedora.sh b/utils/scripts/helpers/repo-helper-fedora.sh index 1f3de8cda8a..cb791907381 100644 --- a/utils/scripts/helpers/repo-helper-fedora.sh +++ b/utils/scripts/helpers/repo-helper-fedora.sh @@ -64,7 +64,7 @@ if [ -n "$REPO_FILE_URL" ]; then pushd /etc/yum.repos.d/ curl -k --noproxy '*' -sSf \ -o "daos_ci-fedora${archive}-${REPOSITORY_NAME}.repo" \ - "{$REPO_FILE_URL}daos_ci-fedora${archive}-${REPOSITORY_NAME}.repo" + "${REPO_FILE_URL}daos_ci-fedora${archive}-${REPOSITORY_NAME}.repo" disable_repos /etc/yum.repos.d/ popd fi