diff --git a/conftest.py b/conftest.py
index e4d9f5ef7..3931e6077 100644
--- a/conftest.py
+++ b/conftest.py
@@ -75,6 +75,13 @@ def pytest_addoption(parser):
              "4KiB blocksize to be formatted and used in storage tests. "
              "Set it to 'auto' to let the fixtures auto-detect available disks."
     )
+    parser.addoption(
+        "--expansion-sr-disk",
+        action="append",
+        default=[],
+        help="Name of an available disk (sdc) or partition device (sdc2) to be formatted and used in storage tests. "
+             "Set it to 'auto' to let the fixtures auto-detect available disks."
+    )
 
 def pytest_configure(config):
     global_config.ignore_ssh_banner = config.getoption('--ignore-ssh-banner')
diff --git a/lib/host.py b/lib/host.py
index 29ffd9b07..e5da2defb 100644
--- a/lib/host.py
+++ b/lib/host.py
@@ -516,7 +516,22 @@ def disks(self):
         disks.sort()
         return disks
 
-    def disk_is_available(self, disk):
+    def raw_disk_is_available(self, disk: str) -> bool:
+        """
+        Check if a raw disk (without any identifiable filesystem or partition label) is available.
+        A blkid return code of 2 suggests the disk is "raw", i.e. likely unformatted and thus available.
+        """
+        return self.ssh_with_result(['blkid', '/dev/' + disk]).returncode == 2
+
+    def disk_is_available(self, disk: str) -> bool:
+        """
+        Check if a disk is unmounted and appears available for use.
+        It may or may not contain an identifiable filesystem or partition label.
+        If there are no mountpoints, the disk is assumed not to be in use.
+
+        Warning: this function may misclassify LVM_member disks (e.g. in XOSTOR, RAID, ZFS setups) as "available".
+        Such disks may have no mountpoints but still be in use.
+        """
         return len(self.ssh(['lsblk', '-n', '-o', 'MOUNTPOINT', '/dev/' + disk]).strip()) == 0
 
     def available_disks(self, blocksize=512):
diff --git a/tests/storage/linstor/conftest.py b/tests/storage/linstor/conftest.py
index afb09da4d..dcd4e1e84 100644
--- a/tests/storage/linstor/conftest.py
+++ b/tests/storage/linstor/conftest.py
@@ -37,9 +37,10 @@ def lvm_disks(host, sr_disks_for_all_hosts, provisioning_type):
     yield devices
 
     for host in hosts:
+        devices = host.ssh('vgs ' + GROUP_NAME + ' -o pv_name --no-headings').split("\n")
         host.ssh(['vgremove', '-f', GROUP_NAME])
         for device in devices:
-            host.ssh(['pvremove', device])
+            host.ssh(['pvremove', '-ff', '-y', device.strip()])
 
 @pytest.fixture(scope="package")
 def storage_pool_name(provisioning_type):
@@ -108,3 +109,120 @@ def vm_on_linstor_sr(host, linstor_sr, vm_ref):
     yield vm
     logging.info("<< Destroy VM")
     vm.destroy(verify=True)
+
+@pytest.fixture(scope='module')
+def prepare_linstor_packages(hostB1):
+    if not hostB1.is_package_installed(LINSTOR_PACKAGE):
+        logging.info("Installing %s on host %s", LINSTOR_PACKAGE, hostB1)
+        hostB1.yum_install([LINSTOR_RELEASE_PACKAGE])
+        hostB1.yum_install([LINSTOR_PACKAGE], enablerepo="xcp-ng-linstor-testing")
+        # Needed because the linstor driver is not in the xapi sm-plugins list
+        # before installing the LINSTOR packages.
+ hostB1.ssh(["systemctl", "restart", "multipathd"]) + hostB1.restart_toolstack(verify=True) + yield + hostB1.yum_remove([LINSTOR_PACKAGE]) # Package cleanup + +@pytest.fixture(scope='module') +def setup_lvm_on_host(hostB1): + # Ensure that the host has disks available to use, we do not care about disks symmetry across pool + # We need the disk to be "raw" (non LVM_member etc) to use + disks = [d for d in hostB1.available_disks() if hostB1.raw_disk_is_available(d)] + assert disks, "hostB1 requires at least one raw disk" + devices = [f"/dev/{d}" for d in disks] + + for disk in devices: + logging.info("Found Disk %s", disk) + hostB1.ssh(['pvcreate', disk]) + hostB1.ssh(['vgcreate', GROUP_NAME] + devices) + + yield "linstor_group", devices + +@pytest.fixture(scope='module') +def join_host_to_pool(host, hostB1): + assert len(hostB1.pool.hosts) == 1, "This test requires second host to be a single host" + original_pool = hostB1.pool + logging.info("Joining host %s to pool %s", hostB1, host) + hostB1.join_pool(host.pool) + yield + host.pool.eject_host(hostB1) + hostB1.pool = original_pool + +@pytest.fixture(scope='module') +def vm_with_reboot_check(vm_on_linstor_sr): + vm = vm_on_linstor_sr + vm.start() + vm.wait_for_os_booted() + yield vm + vm.shutdown(verify=True) + # Ensure VM is able to start and shutdown on modified SR + vm.start() + vm.wait_for_os_booted() + vm.shutdown(verify=True) + +@pytest.fixture(scope='module') +def evacuate_host_and_prepare_removal(host, hostA2, vm_with_reboot_check): + assert len(host.pool.hosts) >= 3, "This test requires Pool to have more than 3 hosts" + + vm = vm_with_reboot_check + try: + host.ssh(f'xe host-evacuate uuid={hostA2.uuid}') + except Exception as e: + logging.warning("Host evacuation failed: %s", e) + if "lacks the feature" in getattr(e, "stdout", ""): + vm.shutdown(verify=True, force_if_fails=True) + host.ssh(f'xe host-evacuate uuid={hostA2.uuid}') + available_hosts = [h.uuid for h in host.pool.hosts if h.uuid != hostA2.uuid] + if available_hosts: + vm.start(on=available_hosts[0]) + yield + +@pytest.fixture(scope='module') +def remove_host_from_linstor(host, hostA2, linstor_sr, evacuate_host_and_prepare_removal): + import time + # Select a host that is not running the LINSTOR controller (port 3370) + linstor_controller_host = None + for h in host.pool.hosts: + if h.ssh_with_result(["ss -tuln | grep :3370"]).returncode == 0: + linstor_controller_host = h + break + + # If the controller is running on the host to be ejected (hostA2), stop the services first + if linstor_controller_host and linstor_controller_host.uuid == hostA2.uuid: + logging.info("Ejecting host is running LINSTOR controller, stopping services first.") + hostA2.ssh("systemctl stop linstor-controller.service") + hostA2.ssh("systemctl stop drbd-reactor.service") + hostA2.ssh("systemctl stop drbd-graceful-shutdown.service") + time.sleep(30) # Give time for services to stop + + ejecting_host = hostA2.xe('host-param-get', {'uuid': hostA2.uuid, 'param-name': 'name-label'}) + controller_option = "--controllers=" + ",".join([m.hostname_or_ip for m in host.pool.hosts]) + + hostA2.ssh("systemctl stop linstor-satellite.service") + + pbd = host.xe('pbd-list', {'sr-uuid': linstor_sr.uuid, 'host-uuid': hostA2.uuid}, minimal=True) + host.xe('pbd-unplug', {'uuid': pbd}) + + logging.info(host.ssh_with_result(["linstor", controller_option, "node", "delete", ejecting_host]).stdout) + host.pool.eject_host(hostA2) + + yield + + logging.info("Rejoining hostA2 to the pool after test") + 
+    hostA2.join_pool(host.pool)
+    # We don't want linstor services to be running on a deleted node
+    hostA2.ssh("systemctl stop linstor-satellite.service")
+    hostA2.ssh("systemctl stop drbd-graceful-shutdown.service")
+    # TODO: Package list is not retained in teardown
+    # hostA2.saved_packages_list = hostA2.packages()
+    # hostA2.saved_rollback_id = hostA2.get_last_yum_history_tid()
+
+@pytest.fixture(scope='module')
+def get_sr_size(linstor_sr):
+    sr = linstor_sr
+    sr_size = int(sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'}))
+    logging.info("SR Size: %s", sr_size)
+    yield
+    new_sr_size = int(sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'}))
+    logging.info("New SR Size vs Old SR Size: %s vs %s", new_sr_size, sr_size)
+    assert new_sr_size != sr_size, "SR size did not change"
diff --git a/tests/storage/linstor/test_linstor_sr.py b/tests/storage/linstor/test_linstor_sr.py
index 7dc6f4597..e64628e9b 100644
--- a/tests/storage/linstor/test_linstor_sr.py
+++ b/tests/storage/linstor/test_linstor_sr.py
@@ -2,7 +2,7 @@
 import pytest
 import time
 
-from .conftest import LINSTOR_PACKAGE
+from .conftest import GROUP_NAME, LINSTOR_PACKAGE
 from lib.commands import SSHCommandFailed
 from lib.common import wait_for, vm_image
 from tests.storage import vdi_is_open
@@ -86,6 +86,158 @@ def test_snapshot(self, vm_on_linstor_sr):
         finally:
             vm.shutdown(verify=True)
 
+    @pytest.mark.small_vm
+    def test_linstor_sr_expand_disk(self, linstor_sr, provisioning_type, storage_pool_name,
+                                    pytestconfig, vm_with_reboot_check):
+        """
+        This test demonstrates online expansion of a LINSTOR SR while a VM is actively running on it.
+
+        It identifies hosts within the same pool, detects free raw disks, and expands the LVM to grow the SR.
+        A VM is started before the expansion, and its functionality is verified through a shutdown and restart
+        after the expansion completes successfully.
+        """
+        sr = linstor_sr
+        sr_size = sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'})
+
+        resized = _expand_lvm_on_hosts(sr, provisioning_type, storage_pool_name, pytestconfig)
+
+        # Ensure that LINSTOR is healthy/up-to-date before moving ahead.
+        time.sleep(30)  # Wait for LINSTOR node communications to recover.
+        sr.scan()
+        new_sr_size = sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'})
+        assert int(new_sr_size) > int(sr_size) and resized is True, \
+            f"Expected SR size to increase but got old size: {sr_size}, new size: {new_sr_size}"
+        logging.info("SR expansion completed")
+
+    @pytest.mark.small_vm
+    def test_linstor_sr_expand_host(self, linstor_sr, vm_with_reboot_check, prepare_linstor_packages,
+                                    join_host_to_pool, setup_lvm_on_host, host, hostB1, storage_pool_name,
+                                    provisioning_type):
+        """
+        This test validates expansion of a LINSTOR SR by dynamically adding a new host with local storage to the pool.
+        A VM is started on the SR before expansion begins to ensure the SR is in active use during the process.
+
+        It performs the following steps:
+        - Installs LINSTOR packages on the new host (if missing).
+        - Detects and prepares raw disks using LVM commands.
+        - Joins the host (hostB1) to the existing pool and registers it with LINSTOR as a node.
+        - Creates a new LINSTOR storage pool on the added host (LVM or LVM-thin, based on provisioning type).
+        - Confirms SR expansion by verifying increased physical size.
+        - Ensures SR functionality by rebooting the VM running on the SR.
+
+        Finally, the test cleans up by deleting the LINSTOR node, ejecting the host from the pool,
+        and removing packages and LVM metadata.
+        """
+        sr = linstor_sr
+        sr_size = sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'})
+        resized = False
+
+        # TODO: This section could be moved into a separate fixture for modularity.
+        # However, capturing the SR size before expansion is critical to the test logic,
+        # so it's intentionally kept inline to preserve control over the measurement point.
+
+        sr_group_name = "xcp-sr-" + storage_pool_name.replace("/", "_")
+        hostname = hostB1.xe('host-param-get', {'uuid': hostB1.uuid, 'param-name': 'name-label'})
+        controller_option = "--controllers=" + ",".join([m.hostname_or_ip for m in host.pool.hosts])
+
+        logging.info("Current list of linstor nodes:")
+        logging.info(host.ssh_with_result(["linstor", controller_option, "node", "list"]).stdout)
+
+        logging.info("Creating linstor node")
+        host.ssh(["linstor", controller_option, "node", "create", "--node-type", "combined",
+                  "--communication-type", "plain", hostname, hostB1.hostname_or_ip])
+        hostB1.ssh(['systemctl', 'restart', 'linstor-satellite.service'])
+        time.sleep(45)
+
+        logging.info("New list of linstor nodes:")
+        logging.info(host.ssh_with_result(["linstor", controller_option, "node", "list"]).stdout)
+        logging.info("Expanding with linstor node")
+
+        if provisioning_type == "thin":
+            hostB1.ssh(['lvcreate', '-l', '+100%FREE', '-T', storage_pool_name])
+            host.ssh(["linstor", controller_option, "storage-pool", "create", "lvmthin",
+                      hostname, sr_group_name, storage_pool_name])
+        else:
+            host.ssh(["linstor", controller_option, "storage-pool", "create", "lvm",
+                      hostname, sr_group_name, storage_pool_name])
+
+        sr.scan()
+        resized = True
+        new_sr_size = sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'})
+        assert int(new_sr_size) > int(sr_size) and resized is True, \
+            f"Expected SR size to increase but got old size: {sr_size}, new size: {new_sr_size}"
+        logging.info("SR expansion completed from size %s to %s", sr_size, new_sr_size)
+
+        # Cleanup
+        host.ssh(["linstor", controller_option, "node", "delete", hostname])
+
+    @pytest.mark.small_vm
+    def test_linstor_sr_reduce_disk(self, linstor_sr, vm_with_reboot_check, provisioning_type):
+        """
+        Identify hosts within the same pool, detect used disks, modify LVM, and rescan LINSTOR SR.
+ """ + if provisioning_type == "thin": + logging.info(f"* SR reductoin by removing device is not supported for {provisioning_type} type *") + return + sr = linstor_sr + sr_size = int(sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'})) + resized = False + + for h in sr.pool.hosts: + logging.info("Working on %s", h.hostname_or_ip) + devices = h.ssh('vgs ' + GROUP_NAME + ' -o pv_name --no-headings').split("\n") + assert len(devices) > 1, "This test requires {GROUP_NAME} to have more than 1 disk or parition" + eject_device = devices[-1].strip() + logging.info("Attempting to remove device: %s", eject_device) + try: + h.ssh(['pvmove', eject_device]) # Choosing last device from list, assuming its least filled + h.ssh(['vgreduce', GROUP_NAME, eject_device]) + h.ssh(['pvremove', eject_device]) + except SSHCommandFailed as e: + if "No data to move for" in e.stdout: + h.ssh(['vgreduce', GROUP_NAME, eject_device]) + h.ssh(['pvremove', eject_device]) + else: + pytest.fail("Failed to empty device") + h.ssh('systemctl restart linstor-satellite.service') + resized = True + + # Need to ensure that linstor is healthy/up-to-date before moving ahead. + time.sleep(30) # Wait time for Linstor node communications to restore after service restart. + + sr.scan() + + new_sr_size = int(sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'})) + assert new_sr_size < sr_size and resized, \ + f"Expected SR size to decrease but got old size: {sr_size}, new size: {new_sr_size}" + logging.info("SR reduction by removing disk is completed from %s to %s", sr_size, new_sr_size) + + @pytest.mark.small_vm + def test_linstor_sr_reduce_host(self, linstor_sr, get_sr_size, vm_with_reboot_check, host, hostA2, + remove_host_from_linstor): + """ + Remove non master host from the same pool Linstor SR. + Do we measure the time taken by system to rebalance after host removal? + Should the host be graceful empty or force removal? + """ + sr = linstor_sr + sr_size = int(sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'})) + sr_size = 886189670400 + resized = False + + # Restart satellite services for clean state. This can be optional. + for h in host.pool.hosts: + h.ssh(['systemctl', 'restart', 'linstor-satellite.service']) + + time.sleep(30) # Wait till all services become normal + + resized = True + sr.scan() + new_sr_size = int(sr.pool.master.xe('sr-param-get', {'uuid': sr.uuid, 'param-name': 'physical-size'})) + assert new_sr_size < sr_size and resized, \ + f"Expected SR size to decrease but got old size: {sr_size}, new size: {new_sr_size}" + logging.info("SR reduction by removing host is completed from %s to %s", sr_size, new_sr_size) + # *** tests with reboots (longer tests). 
 
     @pytest.mark.reboot
@@ -133,6 +285,40 @@ def test_linstor_missing(self, linstor_sr, host):
 
 # *** End of tests with reboots
 
+def _expand_lvm_on_hosts(sr, provisioning_type, storage_pool_name, pytestconfig):
+    from lib.commands import SSHCommandFailed
+    resized = False
+    for h in sr.pool.hosts:
+        logging.info(f"Checking for available disks on host: {h.hostname_or_ip}")
+        available_disks = [d for d in h.available_disks() if h.raw_disk_is_available(d)]
+
+        disks = []
+        expansion_sr_disk = pytestconfig.getoption("expansion_sr_disk")
+        if expansion_sr_disk:
+            assert len(expansion_sr_disk) == 1, "Only one --expansion-sr-disk should be provided"
+            if expansion_sr_disk[0] == "auto":
+                disks = available_disks
+            else:
+                assert expansion_sr_disk[0] in available_disks, "The specified expansion disk is unavailable"
+                disks = expansion_sr_disk
+        else:
+            disks = available_disks
+
+        for disk in disks:
+            device = f"/dev/{disk}"
+            try:
+                h.ssh(['pvcreate', device])
+                h.ssh(['vgextend', GROUP_NAME, device])
+                if provisioning_type == "thin":
+                    h.ssh(['lvextend', '-l', '+100%FREE', storage_pool_name])
+                else:
+                    h.ssh(['systemctl', 'restart', 'linstor-satellite.service'])
+                resized = True
+                logging.info("LVM extended on host %s using device %s", h.hostname_or_ip, device)
+            except SSHCommandFailed as e:
+                raise RuntimeError(f"Disk expansion failed on {h.hostname_or_ip}: {e}")
+    return resized
+
 # --- Test diskless resources --------------------------------------------------
 
 def _get_diskful_hosts(host, controller_option, volume_name):