diff --git a/abi/include/abi/ipc/interfaces.h b/abi/include/abi/ipc/interfaces.h index fb72d49da4..db40e4e1a6 100644 --- a/abi/include/abi/ipc/interfaces.h +++ b/abi/include/abi/ipc/interfaces.h @@ -205,7 +205,9 @@ typedef enum { INTERFACE_SYSTEM = FOURCC_COMPACT('s', 's', 't', 'm') | IFACE_EXCHANGE_SERIALIZE, INTERFACE_SYSTEM_CB = - FOURCC_COMPACT('s', 's', 't', 'm') | IFACE_EXCHANGE_SERIALIZE | IFACE_MOD_CALLBACK + FOURCC_COMPACT('s', 's', 't', 'm') | IFACE_EXCHANGE_SERIALIZE | IFACE_MOD_CALLBACK, + INTERFACE_HR = + FOURCC_COMPACT('h', 'r', ' ', ' ') | IFACE_EXCHANGE_SERIALIZE } iface_t; #endif diff --git a/uspace/app/bdwrite/bdwrite.c b/uspace/app/bdwrite/bdwrite.c new file mode 100644 index 0000000000..a2d523bec1 --- /dev/null +++ b/uspace/app/bdwrite/bdwrite.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/** @addtogroup bdwrite
+ * @{
+ */
+/**
+ * @file
+ */
+
+#include <abi/ipc/ipc.h>
+#include <block.h>
+#include <errno.h>
+#include <getopt.h>
+#include <loc.h>
+#include <macros.h>
+#include <mem.h>
+#include <perf.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+static void usage(void);
+
+static const char usage_str[] =
+    "Usage: bdwrite <device> -o <offset> -c <count>\n"
+    "\n"
+    "  Write a cyclic block pattern to a block device.\n";
+
+static struct option const long_options[] = {
+	{ 0, 0, 0, 0 }
+};
+
+static void usage(void)
+{
+	printf("%s", usage_str);
+}
+
+int main(int argc, char **argv)
+{
+	errno_t rc;
+	size_t bsize;
+	int c;
+	char *name = NULL;
+	size_t blkcnt = 0, off = 0;
+	service_id_t dev;
+
+	if (argc != 6) {
+		goto bad;
+	}
+
+	name = argv[1];
+
+	c = 0;
+	optreset = 1;
+	optind = 0;
+
+	while (c != -1) {
+		c = getopt_long(argc, argv, "o:c:", long_options, NULL);
+		switch (c) {
+		case 'o':
+			off = strtol(optarg, NULL, 10);
+			break;
+		case 'c':
+			blkcnt = strtol(optarg, NULL, 10);
+			break;
+		}
+	}
+
+	rc = loc_service_get_id(name, &dev, 0);
+	if (rc != EOK) {
+		printf("bdwrite: error resolving device \"%s\"\n", name);
+		return 1;
+	}
+	rc = block_init(dev);
+	if (rc != EOK) {
+		printf("bdwrite: error initializing block device \"%s\"\n", name);
+		return 1;
+	}
+
+	rc = block_get_bsize(dev, &bsize);
+	if (rc != EOK) {
+		printf("bdwrite: error getting block size of \"%s\"\n", name);
+		block_fini(dev);
+		return 1;
+	}
+
+	uint64_t to_alloc = min(DATA_XFER_LIMIT, bsize * blkcnt);
+	uint8_t *buf = malloc(to_alloc);
+	if (buf == NULL) {
+		printf("bdwrite: failed to allocate buffer\n");
+		block_fini(dev);
+		return 1;
+	}
+
+	stopwatch_t stopwatch;
+	stopwatch_init(&stopwatch);
+	stopwatch_start(&stopwatch);
+
+	uint64_t left = blkcnt;
+	while (left != 0) {
+		uint64_t blks_to_write = min(to_alloc / bsize, left);
+		uint8_t *ptr = buf;
+		for (size_t i = 0; i < blks_to_write; i++) {
+			/* memset(ptr, (i + 1) % 0x100, bsize); */
+			memset(ptr, 'A' + (i % 26), bsize);
+			ptr += bsize;
+		}
+		rc = block_write_direct(dev, off, blks_to_write, buf);
+		if (rc != EOK) {
+			printf("bdwrite: error writing to \"%s\"\n", name);
+			goto end;
+		}
+		left -= blks_to_write;
+		off += blks_to_write;
+	}
+end:
+	stopwatch_stop(&stopwatch);
+	nsec_t t = stopwatch_get_nanos(&stopwatch);
+	printf("Elapsed time:\n");
+	printf("\t%llu ms\n", NSEC2MSEC(t));
+	printf("\t%lf s\n", NSEC2SEC((double)t));
+	free(buf);
+	block_fini(dev);
+	return rc;
+bad:
+	usage();
+	return 0;
+}
+
+/** @}
+ */
diff --git a/uspace/app/bdwrite/meson.build b/uspace/app/bdwrite/meson.build
new file mode 100644
index 0000000000..faa14ed30e
--- /dev/null
+++ b/uspace/app/bdwrite/meson.build
@@ -0,0 +1,30 @@
+#
+# Copyright (c) 2024 Miroslav Cimerman
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# - Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# - The name of the author may not be used to endorse or promote products
+#   derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+deps = [ 'block' ]
+src = files('bdwrite.c')
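A quick way to exercise bdwrite is from bdsh, using the hypothetical file-backed device name disk1 that the create_file_bd_disks.bdsh script below registers (offset and count are in blocks):

    bdwrite disk1 -o 0 -c 1024

Per the code above, this fills blocks 0 through 1023 of disk1 with a repeating 'A'..'Z' pattern and prints the elapsed time.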
diff --git a/uspace/app/hrctl/create_file_bd_disks.bdsh b/uspace/app/hrctl/create_file_bd_disks.bdsh
new file mode 100644
index 0000000000..cb2d84dcfe
--- /dev/null
+++ b/uspace/app/hrctl/create_file_bd_disks.bdsh
@@ -0,0 +1,9 @@
+mkfile -s 100M /tmp/file1
+mkfile -s 100M /tmp/file2
+mkfile -s 100M /tmp/file3
+mkfile -s 100M /tmp/file4
+
+/srv/bd/file_bd -b 512 /tmp/file1 disk1
+/srv/bd/file_bd -b 512 /tmp/file2 disk2
+/srv/bd/file_bd -b 512 /tmp/file3 disk3
+/srv/bd/file_bd -b 512 /tmp/file4 disk4
diff --git a/uspace/app/hrctl/hrctl.c b/uspace/app/hrctl/hrctl.c
new file mode 100644
index 0000000000..215f453798
--- /dev/null
+++ b/uspace/app/hrctl/hrctl.c
@@ -0,0 +1,862 @@
+/*
+ * Copyright (c) 2025 Miroslav Cimerman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup hrctl
+ * @{
+ */
+/**
+ * @file
+ */
+
+#include <capa.h>
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <hr.h>
+#include <inttypes.h>
+#include <loc.h>
+#include <mem.h>
+#include <sif.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <str.h>
+#include <str_error.h>
+
+/* #define HRCTL_SAMPLE_CONFIG_PATH "/cfg/sample_hr_config.sif" */
+
+#define NAME "hrctl"
+
+static void usage(void);
+static errno_t fill_config_devs(int, char **, hr_config_t *);
+static errno_t get_vol_configs_from_sif(const char *, hr_config_t **, size_t *);
+static int create_from_config(hr_t *, const char *, uint8_t);
+static int create_from_argv(hr_t *, int, char **, uint8_t);
+static int handle_create(hr_t *, int, char **);
+static int assemble_from_config(hr_t *, const char *, uint8_t);
+static int assemble_from_argv(hr_t *, int, char **, uint8_t);
+static int handle_assemble(hr_t *, int, char **);
+static int handle_disassemble(hr_t *, int, char **);
+static int handle_modify(hr_t *, int, char **);
+static errno_t print_vol_info(hr_vol_info_t *);
+static int handle_state(hr_t *, int, char **);
+
+static const char usage_str[] =
+    NAME ": HelenOS RAID configuration and management utility.\n"
+    "Usage: " NAME " [OPTION]...\n"
+    "\n"
+    "Options:\n"
+    "  -h, --help                               Display this message and exit.\n"
+    "\n"
+    "  -c, --create [--no-meta] [--read-only]   Create a volume, options:\n"
+    "      name {-l level, --level level} device...  manual device specification, or\n"
+    "      -f configuration.sif                      create from configuration file.\n"
+    "\n"
+    "  -a, --assemble                           Assemble volume(s), options:\n"
+    "      [[--read-only] device...]                 manual device specification, or\n"
+    "      [[--read-only] -f configuration.sif]      assemble from configuration file, or\n"
+    "                                                no option is automatic assembly.\n"
+    "\n"
+    "  -d, --disassemble                        Deactivate/disassemble, options:\n"
+    "      [volume]                                  specific volume, or\n"
+    "                                                all volumes with no specified option.\n"
+    "\n"
+    "  -m, --modify volume                      Modify a volume, options:\n"
+    "      -f, --fail index                          fail an extent (DANGEROUS), or\n"
+    "      -h, --hotspare device                     add hotspare.\n"
+    "\n"
+    "  -s, --state [volume]                     Display state of active volume(s).\n"
+    "\n"
+    "level can be one of:\n"
+    "  0 | stripe | striping |\n"
+    "  1 | mirror | mirroring |\n"
+    "  4 | parity_dedicated |\n"
+    "  5 | parity | parity_distributed\n"
+    "\n"
+    "Example usage:\n"
+    "\t\thrctl --create hr0 --level 5 disk1 disk2 disk3\n"
+    "\t\thrctl -c hr0 -l 5 disk1 disk2 disk3\n"
+    "\t\thrctl -c -f cfg.sif\n"
+    "\t\thrctl --assemble disk1 disk2 disk3\n"
+    "\t\thrctl -a\n"
+    "\t\thrctl -d hr0\n"
+    "\t\thrctl -d\n"
+    "\t\thrctl --modify hr0 --fail 0\n"
+    "\t\thrctl --modify hr0 --hotspare disk4\n"
+    "\t\thrctl -s\n"
+    "\n"
+    "Notes:\n"
+    "  Add --no-meta after --create to disable storing on-disk metadata.\n"
+    "  Simulating an extent failure with -m volume -f index is dangerous. It marks\n"
+    "  metadata as dirty in other healthy extents, and zeroes out the superblock\n"
+    "  on the specified extent.\n"
+    "  Nested levels have to be created manually, or from a config file, and need to\n"
+    "  be specified as separate volumes.\n"
+    "\n"
+    "Limitations:\n"
+    "\t- volume name must be shorter than 32 characters\n"
+    "\t- no explicit support for nested-level volumes\n"
+    "\t- automatic assembly and disassembly of nested volumes is undefined\n";
+
+int main(int argc, char **argv)
+{
+	int rc = EXIT_SUCCESS;
+	hr_t *hr = NULL;
+
+	if (argc < 2) {
+		rc = EXIT_FAILURE;
+		goto end;
+	}
+
+	if (argc > 1 &&
+	    ((str_cmp(argv[1], "-h") == 0) ||
+	    str_cmp(argv[1], "--help") == 0)) {
+		usage();
+		return EXIT_SUCCESS;
+	}
+
+	rc = hr_sess_init(&hr);
+	if (rc != EOK) {
+		printf(NAME ": hr server session init failed: %s\n",
+		    str_error(rc));
+		return EXIT_FAILURE;
+	}
+
+	optreset = 1;
+	optind = 0;
+
+	struct option const top_level_opts[] = {
+		{ "help", no_argument, 0, 'h' },
+		{ "create", no_argument, 0, 'c' },
+		{ "assemble", no_argument, 0, 'a' },
+		{ "disassemble", no_argument, 0, 'd' },
+		{ "modify", no_argument, 0, 'm' },
+		{ "state", no_argument, 0, 's' },
+		{ 0, 0, 0, 0 }
+	};
+
+	int c = getopt_long(argc, argv, "hcadms", top_level_opts, NULL);
+	switch (c) {
+	case 'h':
+		usage();
+		goto end;
+	case 'c':
+		rc = handle_create(hr, argc, argv);
+		goto end;
+	case 'a':
+		rc = handle_assemble(hr, argc, argv);
+		goto end;
+	case 'd':
+		rc = handle_disassemble(hr, argc, argv);
+		goto end;
+	case 'm':
+		rc = handle_modify(hr, argc, argv);
+		goto end;
+	case 's':
+		rc = handle_state(hr, argc, argv);
+		goto end;
+	default:
+		rc = EXIT_FAILURE;
+		goto end;
+	}
+
+end:
+	hr_sess_destroy(hr);
+
+	if (rc != EXIT_SUCCESS)
+		printf(NAME ": use --help to see usage\n");
+
+	return rc;
+}
+
+static void usage(void)
+{
+	printf("%s", usage_str);
+}
+
+static errno_t fill_config_devs(int argc, char **argv, hr_config_t *cfg)
+{
+	errno_t rc;
+	size_t i;
+
+	for (i = 0; i < HR_MAX_EXTENTS && optind < argc; i++) {
+		rc = loc_service_get_id(argv[optind], &cfg->devs[i], 0);
+		if (rc == ENOENT) {
+			printf(NAME ": device \"%s\" not found, aborting\n",
+			    argv[optind]);
+			return ENOENT;
+		} else if (rc != EOK) {
+			printf(NAME ": error resolving device \"%s\", aborting\n",
+			    argv[optind]);
+			return EINVAL;
+		}
+		optind++;
+	}
+
+	if (optind < argc) {
+		printf(NAME ": too many devices specified, max = %u\n",
+		    HR_MAX_EXTENTS);
+		return ELIMIT;
+	}
+
+	cfg->dev_no = i;
+
+	return EOK;
+}
+
+static errno_t get_vol_configs_from_sif(const char *path, hr_config_t **rcfgs,
+    size_t *rcount)
+{
+	errno_t rc;
+	sif_doc_t *doc = NULL;
+	sif_node_t *hrconfig_node;
+	sif_node_t *root_node;
+	sif_node_t *volume_node;
+	sif_node_t *nextent;
+	const char *ntype;
+	const char *devname;
+	const char *level_str;
+	const char *extent_devname;
+	hr_config_t *vol_configs = NULL;
+
+	rc = sif_load(path, &doc);
+	if (rc != EOK)
+		goto error;
+
+	root_node = sif_get_root(doc);
+
+	hrconfig_node = sif_node_first_child(root_node);
+	ntype = sif_node_get_type(hrconfig_node);
+	if (str_cmp(ntype, "hrconfig") != 0) {
+		rc = EINVAL;
+		goto error;
+	}
+
+	size_t vol_count = 0;
+	volume_node = sif_node_first_child(hrconfig_node);
+	while (volume_node) {
+		ntype = sif_node_get_type(volume_node);
+		if (str_cmp(ntype, "volume") != 0) {
+			rc = EINVAL;
+			goto error;
+		}
+		hr_config_t *tmp = realloc(vol_configs,
+		    (vol_count + 1) * sizeof(hr_config_t));
+		if (tmp == NULL) {
+			rc = ENOMEM;
+			goto error;
+		}
+		vol_configs = tmp;
+
+		hr_config_t *cfg = vol_configs + vol_count;
+
+		devname = sif_node_get_attr(volume_node, "devname");
+		if (devname == NULL) {
+			rc = EINVAL;
+			goto error;
+		}
+		str_cpy(cfg->devname, sizeof(cfg->devname), devname);
+
+		level_str = sif_node_get_attr(volume_node, "level");
+		if (level_str == NULL)
+			cfg->level = HR_LVL_UNKNOWN;
+		else
+			cfg->level = strtol(level_str, NULL, 10);
+
+		nextent = sif_node_first_child(volume_node);
+		size_t i = 0;
+		while (nextent && i < HR_MAX_EXTENTS) {
+			ntype = sif_node_get_type(nextent);
+			if (str_cmp(ntype, "extent") != 0) {
+				rc = EINVAL;
+				goto error;
+			}
+
+			extent_devname = sif_node_get_attr(nextent, "devname");
+			if (extent_devname == NULL) {
+				rc = EINVAL;
+				goto error;
+			}
+
+			rc = loc_service_get_id(extent_devname, &cfg->devs[i], 0);
+			if (rc == ENOENT) {
+				printf(NAME ": no device \"%s\", marking as missing\n",
+				    extent_devname);
+				cfg->devs[i] = 0;
+				rc = EOK;
+			} else if (rc != EOK) {
+				printf(NAME ": error resolving device \"%s\", aborting\n",
+				    extent_devname);
+				goto error;
+			}
+
+			nextent = sif_node_next_child(nextent);
+			i++;
+		}
+
+		if (nextent != NULL) {
+			/* extents beyond HR_MAX_EXTENTS remain unprocessed */
+			printf(NAME ": too many devices specified in volume \"%s\", "
+			    "skipping\n", devname);
+			memset(&vol_configs[vol_count], 0, sizeof(hr_config_t));
+		} else {
+			cfg->dev_no = i;
+			vol_count++;
+		}
+
+		volume_node = sif_node_next_child(volume_node);
+	}
+
+	if (rc == EOK) {
+		if (rcount)
+			*rcount = vol_count;
+		if (rcfgs)
+			*rcfgs = vol_configs;
+	}
+error:
+	if (doc != NULL)
+		sif_delete(doc);
+	if (rc != EOK) {
+		if (vol_configs)
+			free(vol_configs);
+	}
+	return rc;
+}
+
+static int create_from_config(hr_t *hr, const char *config_path,
+    uint8_t vol_flags)
+{
+	hr_config_t *vol_configs = NULL;
+	size_t vol_count = 0;
+	errno_t rc = get_vol_configs_from_sif(config_path, &vol_configs,
+	    &vol_count);
+	if (rc != EOK) {
+		printf(NAME ": config parsing failed\n");
+		return EXIT_FAILURE;
+	}
+
+	for (size_t i = 0; i < vol_count; i++)
+		vol_configs[i].vol_flags |= vol_flags;
+
+	for (size_t i = 0; i < vol_count; i++) {
+		rc = hr_create(hr, &vol_configs[i]);
+		if (rc != EOK) {
+			printf(NAME ": creation of volume \"%s\" failed: %s, "
+			    "but continuing\n",
+			    vol_configs[i].devname, str_error(rc));
+		} else {
+			printf(NAME ": volume \"%s\" successfully created\n",
+			    vol_configs[i].devname);
+		}
+	}
+
+	free(vol_configs);
+	return EXIT_SUCCESS;
+}
+
+static int create_from_argv(hr_t *hr, int argc, char **argv, uint8_t vol_flags)
+{
+	/* we need name + --level + arg + at least one extent */
+	if (optind + 3 >= argc) {
+		printf(NAME ": not enough arguments\n");
+		return EXIT_FAILURE;
+	}
+
+	hr_config_t *vol_config = calloc(1, sizeof(hr_config_t));
+	if (vol_config == NULL) {
+		printf(NAME ": not enough memory\n");
+		return EXIT_FAILURE;
+	}
+
+	vol_config->vol_flags |= vol_flags;
+
+	const char *name = argv[optind++];
+	if (str_size(name) >= HR_DEVNAME_LEN) {
+		printf(NAME ": devname must be shorter than 32 characters\n");
+		goto error;
+	}
+
+	str_cpy(vol_config->devname, HR_DEVNAME_LEN, name);
+
+	const char *level_opt = argv[optind++];
+	if (str_cmp(level_opt, "--level") != 0 &&
+	    str_cmp(level_opt, "-l") != 0) {
+		printf(NAME ": unknown option \"%s\"\n", level_opt);
+		goto error;
+	}
+
+	const char *level_str = argv[optind++];
+	if (str_size(level_str) == 1 && isdigit(level_str[0])) {
+		vol_config->level = strtol(level_str, NULL, 10);
+	} else {
+		if (str_cmp(level_str, "mirror") == 0 ||
+		    str_cmp(level_str, "mirroring") == 0) {
+			vol_config->level = HR_LVL_1;
+		} else if (str_cmp(level_str, "stripe") == 0 ||
+		    str_cmp(level_str, "striping") == 0) {
+			vol_config->level = HR_LVL_0;
+		} else if (str_cmp(level_str, "parity") == 0 ||
+		    str_cmp(level_str, "parity_distributed") == 0) {
+			vol_config->level = HR_LVL_5;
+		} else if (str_cmp(level_str, "parity_dedicated") == 0) {
+			vol_config->level = HR_LVL_4;
+		} else {
+			printf(NAME ": unknown level \"%s\"\n", level_str);
+			goto error;
+		}
+	}
+
+	errno_t rc = fill_config_devs(argc, argv, vol_config);
+	if (rc != EOK)
+		goto error;
+
+	rc = hr_create(hr, vol_config);
+	if (rc != EOK) {
+		printf(NAME ": creation failed: %s\n", str_error(rc));
+		goto error;
+	} else {
+		printf(NAME ": volume \"%s\" successfully created\n",
+		    vol_config->devname);
+	}
+
+	free(vol_config);
+	return EXIT_SUCCESS;
+error:
+	free(vol_config);
+	return EXIT_FAILURE;
+}
+
+static bool try_to_get_additional_flags(int argc, char **argv,
+    uint8_t test_flags, uint8_t *flags)
+{
+	if (optind >= argc)
+		return false;
+
+	if (test_flags & HR_VOL_FLAG_NOOP_META) {
+		if (str_cmp(argv[optind], "--no-meta") == 0) {
+			*flags |= HR_VOL_FLAG_NOOP_META;
+			optind++;
+			return true;
+		}
+	}
+
+	if (test_flags & HR_VOL_FLAG_READ_ONLY) {
+		if (str_cmp(argv[optind], "--read-only") == 0) {
+			*flags |= HR_VOL_FLAG_READ_ONLY;
+			optind++;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static int handle_create(hr_t *hr, int argc, char **argv)
+{
+	int rc;
+	uint8_t vflags = 0;
+
+	if (optind >= argc) {
+		printf(NAME ": no arguments to --create\n");
+		return EXIT_FAILURE;
+	}
+
+	uint8_t test_flags = HR_VOL_FLAG_NOOP_META | HR_VOL_FLAG_READ_ONLY;
+	while (try_to_get_additional_flags(argc, argv, test_flags, &vflags))
+		;
+
+	if (optind >= argc) {
+		printf(NAME ": not enough arguments\n");
+		return EXIT_FAILURE;
+	}
+
+	if (str_cmp(argv[optind], "-f") == 0) {
+		optind++;
+		if (optind >= argc) {
+			printf(NAME ": not enough arguments\n");
+			return EXIT_FAILURE;
+		}
+
+		const char *config_path = argv[optind++];
+
+		if (optind < argc) {
+			printf(NAME ": unexpected arguments\n");
+			return EXIT_FAILURE;
+		}
+
+		rc = create_from_config(hr, config_path, vflags);
+	} else {
+		rc = create_from_argv(hr, argc, argv, vflags);
+	}
+
+	return rc;
+}
+
+static int assemble_from_config(hr_t *hr, const char *config_path,
+    uint8_t vflags)
+{
+	hr_config_t *vol_configs = NULL;
+	size_t vol_count = 0;
+	errno_t rc = get_vol_configs_from_sif(config_path, &vol_configs,
+	    &vol_count);
+	if (rc != EOK) {
+		printf(NAME ": config parsing failed\n");
+		return EXIT_FAILURE;
+	}
+
+	size_t cnt = 0;
+	for (size_t i = 0; i < vol_count; i++) {
+		size_t tmpcnt = 0;
+		vol_configs[i].vol_flags = vflags;
+		(void)hr_assemble(hr, &vol_configs[i], &tmpcnt);
+		cnt += tmpcnt;
+	}
+
+	printf(NAME ": assembled %zu volumes\n", cnt);
+
+	free(vol_configs);
+	return EXIT_SUCCESS;
+}
+
+static int assemble_from_argv(hr_t *hr, int argc, char **argv, uint8_t vflags)
+{
+	hr_config_t *vol_config = calloc(1, sizeof(hr_config_t));
+	if (vol_config == NULL) {
+		printf(NAME ": not enough memory\n");
+		return EXIT_FAILURE;
+	}
+
+	vol_config->vol_flags = vflags;
+
+	errno_t rc = fill_config_devs(argc, argv, vol_config);
+	if (rc != EOK)
+		goto error;
+
+	size_t cnt;
+	rc = hr_assemble(hr, vol_config, &cnt);
+	if (rc != EOK) {
+		printf(NAME ": assemble failed: %s\n", str_error(rc));
+		goto error;
+	}
+
+	printf(NAME ": assembled %zu volumes\n", cnt);
+
+	free(vol_config);
+	return EXIT_SUCCESS;
+error:
+	free(vol_config);
+	return EXIT_FAILURE;
+}
+
+static int handle_assemble(hr_t *hr, int argc, char **argv)
+{
+	int rc;
+
+	if (optind >= argc) {
+		size_t cnt;
+		errno_t rc = hr_auto_assemble(hr, &cnt);
+		if (rc != EOK) {
+			/* XXX: could use own error codes here */
+			printf(NAME ": auto assemble failed: %s\n",
+			    str_error(rc));
+			return EXIT_FAILURE;
+		}
+
+		printf(NAME ": auto assembled %zu volumes\n", cnt);
+		return EXIT_SUCCESS;
+	}
+
+	uint8_t vflags = 0;
+	uint8_t test_flags = HR_VOL_FLAG_NOOP_META | HR_VOL_FLAG_READ_ONLY;
+	while (try_to_get_additional_flags(argc, argv, test_flags, &vflags))
+		;
+
+	if (vflags & HR_VOL_FLAG_NOOP_META)
+		printf(NAME ": assembling, --no-meta flag will be ignored\n");
+
+	if (optind >= argc) {
+		printf(NAME ": not enough arguments\n");
+		return EXIT_FAILURE;
+	}
+
+	if (str_cmp(argv[optind], "-f") == 0) {
+		if (++optind >= argc) {
+			printf(NAME ": not enough arguments\n");
+			return EXIT_FAILURE;
+		}
+		const char *config_path = argv[optind++];
+
+		if (optind < argc) {
+			printf(NAME ": unexpected arguments\n");
+			return EXIT_FAILURE;
+		}
+
+		rc = assemble_from_config(hr, config_path, vflags);
+	} else {
+		rc = assemble_from_argv(hr, argc, argv, vflags);
+	}
+
+	return rc;
+}
+
+static int handle_disassemble(hr_t *hr, int argc, char **argv)
+{
+	if (optind >= argc) {
+		errno_t rc = hr_stop_all(hr);
+		if (rc != EOK) {
+			printf(NAME ": stopping some volumes failed: %s\n",
+			    str_error(rc));
+			return EXIT_FAILURE;
+		}
+		return EXIT_SUCCESS;
+	}
+
+	if (optind + 1 < argc) {
+		printf(NAME ": only 1 device can be manually specified\n");
+		return EXIT_FAILURE;
+	}
+
+	const char *devname = argv[optind++];
+
+	errno_t rc = hr_stop(hr, devname);
+	if (rc != EOK) {
+		printf(NAME ": disassembly of device \"%s\" failed: %s\n",
+		    devname, str_error(rc));
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
+
+static int handle_modify(hr_t *hr, int argc, char **argv)
+{
+	if (optind >= argc) {
+		printf(NAME ": no arguments to --modify\n");
+		return EXIT_FAILURE;
+	}
+
+	const char *volname = argv[optind++];
+
+	/* at least 1 option and its argument */
+	if (optind + 1 >= argc) {
+		printf(NAME ": not enough arguments\n");
+		return EXIT_FAILURE;
+	}
+
+	if (optind + 2 < argc) {
+		printf(NAME ": unexpected arguments\n");
+		return EXIT_FAILURE;
+	}
+
+	if (str_cmp(argv[optind], "--fail") == 0 ||
+	    str_cmp(argv[optind], "-f") == 0) {
+		optind++;
+		unsigned long extent = strtol(argv[optind++], NULL, 10);
+		errno_t rc = hr_fail_extent(hr, volname, extent);
+		if (rc != EOK) {
+			printf(NAME ": failing extent failed: %s\n",
+			    str_error(rc));
+			return EXIT_FAILURE;
+		}
+	} else if (str_cmp(argv[optind], "--hotspare") == 0 ||
+	    str_cmp(argv[optind], "-h") == 0) {
+		optind++;
+		errno_t rc = hr_add_hotspare(hr, volname, argv[optind++]);
+		if (rc != EOK) {
+			printf(NAME ": adding hotspare failed: %s\n",
+			    str_error(rc));
+			return EXIT_FAILURE;
+		}
+	} else {
+		printf(NAME ": unknown argument\n");
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
+
+static errno_t print_vol_info(hr_vol_info_t *info)
+{
+	errno_t rc;
+	size_t i;
+	hr_extent_t *ext;
+	const char *devname;
+
+	printf("volume: '%s' (%" PRIun ")\n", info->devname, info->svc_id);
+
+	printf("|    metadata type: %s\n",
+	    hr_get_metadata_type_str(info->meta_type));
+
+	printf("|           vflags: ");
+	for (size_t v = 0; v < HR_VOL_FLAG_COUNT; v++) {
+		if (info->vflags & (1 << v)) {
+			printf("%s ",
+			    hr_get_vol_flag_str(info->vflags & (1 << v)));
+		}
+	}
+	printf("\n");
+
+	printf("|            level: %s\n", hr_get_level_str(info->level));
+	if (info->layout != HR_LAYOUT_NONE)
+		printf("|           layout: %s\n",
+		    hr_get_layout_str(info->layout));
+
+	if (info->strip_size > 0) {
+		if (info->strip_size < 1024) {
+			printf("|       strip size: %" PRIu32 " B\n",
+			    info->strip_size);
+		} else {
+			printf("|       strip size: %" PRIu32 " KiB\n",
+			    info->strip_size / 1024);
+		}
+	}
+
+	printf("|   no. of extents: %zu\n", info->extent_no);
+	printf("| no. of hotspares: %zu\n", info->hotspare_no);
+	printf("| number of blocks: %" PRIu64 "\n", info->data_blkno);
+	printf("|       block size: %zu B\n", info->bsize);
+
+	capa_spec_t capa;
+	char *scapa = NULL;
+	capa_from_blocks(info->data_blkno, info->bsize, &capa);
+	capa_simplify(&capa);
+	rc = capa_format(&capa, &scapa);
+	if (rc != EOK) {
+		printf(NAME ": failed to format capacity: %s\n", str_error(rc));
+		return rc;
+	}
+
+	printf("|  volume capacity: %s\n", scapa);
+
+	free(scapa);
+
+	printf("|            state: %s", hr_get_vol_state_str(info->state));
+	if (info->state == HR_VOL_REBUILD) {
+		unsigned int percent =
+		    (info->rebuild_blk * 100) / info->data_blkno;
+		printf(" (%u%% done)\n", percent);
+	} else {
+		printf("\n");
+	}
+
+	printf("| extents:\n");
+
+	for (i = 0; i < info->extent_no; i++) {
+		ext = &info->extents[i];
+		char *tmpname = NULL;
+		if (ext->state == HR_EXT_MISSING || ext->state == HR_EXT_NONE) {
+			devname = "MISSING";
+		} else {
+			rc = loc_service_get_name(ext->svc_id, &tmpname);
+			if (rc != EOK)
+				devname = "MISSING";
+			else
+				devname = tmpname;
+		}
+		printf("|   %zu  %s\n", i, hr_get_ext_state_str(ext->state));
+		printf("|       %s\n", devname);
+		if (tmpname != NULL)
+			free(tmpname);
+	}
+
+	if (info->hotspare_no == 0)
+		return EOK;
+
+	printf("| hotspares:\n");
+	for (i = 0; i < info->hotspare_no; i++) {
+		ext = &info->hotspares[i];
+		char *tmpname = NULL;
+		if (ext->state == HR_EXT_MISSING || ext->state == HR_EXT_NONE) {
+			devname = "MISSING";
+		} else {
+			rc = loc_service_get_name(ext->svc_id, &tmpname);
+			if (rc != EOK)
+				devname = "MISSING";
+			else
+				devname = tmpname;
+		}
+		printf("|   %zu  %s\n", i, hr_get_ext_state_str(ext->state));
+		printf("|       %s\n", devname);
+		if (tmpname != NULL)
+			free(tmpname);
+	}
+
+	return EOK;
+}
+
+static int handle_state(hr_t *hr, int argc, char **argv)
+{
+	errno_t rc;
+	size_t cnt;
+	hr_pair_vol_state_t *pairs = NULL;
+	char *devname;
+
+	/* print state of all volumes */
+	if (optind >= argc) {
+		rc = hr_get_vol_states(hr, &pairs, &cnt);
+		if (rc != EOK) {
+			printf(NAME ": failed getting state of volumes: %s\n",
+			    str_error(rc));
+			return EXIT_FAILURE;
+		}
+
+		if (cnt == 0) {
+			printf(NAME ": no active volumes\n");
+			return EXIT_SUCCESS;
+		}
+
+		for (size_t i = 0; i < cnt; i++) {
+			service_id_t svc_id = pairs[i].svc_id;
+			hr_vol_state_t state = pairs[i].state;
+			rc = loc_service_get_name(svc_id, &devname);
+			if (rc != EOK) {
+				printf(NAME ": getting service name failed: "
+				    "%s\n", str_error(rc));
+				free(pairs);
+				return EXIT_FAILURE;
+			}
+			printf("volume '%s' (%" PRIun ") %s\n", devname,
+			    svc_id, hr_get_vol_state_str(state));
+
+			free(devname);
+		}
+		free(pairs);
+
+		return EXIT_SUCCESS;
+	}
+
+	/* print volume info of requested volumes */
+	while (optind < argc) {
+		service_id_t svc_id;
+		devname = argv[optind++];
+		rc = loc_service_get_id(devname, &svc_id, 0);
+		if (rc != EOK) {
+			printf(NAME ": getting service id of \"%s\" failed: "
+			    "%s\n", devname, str_error(rc));
+			return EXIT_FAILURE;
+		}
+
+		hr_vol_info_t info;
+		rc = hr_get_vol_info(hr, svc_id, &info);
+		if (rc != EOK) {
+			printf(NAME ": getting volume info failed: %s\n",
+			    str_error(rc));
+			return EXIT_FAILURE;
+		}
+
+		rc = print_vol_info(&info);
+		if (rc != EOK) {
+			printf(NAME ": volume info printing failed: %s\n",
+			    str_error(rc));
+			return EXIT_FAILURE;
+		}
+	}
+
+	return EXIT_SUCCESS;
+}
+
+/** @}
+ */
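The handlers above are thin wrappers over the libdevice hr client API that this patch introduces further below (uspace/lib/device). For orientation, a minimal C sketch of the same create flow driven directly from code; it assumes the file-backed devices disk1 and disk2 from create_file_bd_disks.bdsh exist, and the function name is illustrative:

    #include <errno.h>
    #include <hr.h>
    #include <loc.h>
    #include <mem.h>
    #include <str.h>

    /* Illustrative: create a two-way mirror named "hr0". */
    static errno_t example_create_mirror(void)
    {
    	hr_t *hr;
    	hr_config_t cfg;
    	errno_t rc;

    	rc = hr_sess_init(&hr);	/* connect to the hr server */
    	if (rc != EOK)
    		return rc;

    	memset(&cfg, 0, sizeof(cfg));
    	str_cpy(cfg.devname, HR_DEVNAME_LEN, "hr0");
    	cfg.level = HR_LVL_1;	/* mirroring */

    	rc = loc_service_get_id("disk1", &cfg.devs[0], 0);
    	if (rc == EOK)
    		rc = loc_service_get_id("disk2", &cfg.devs[1], 0);
    	if (rc == EOK) {
    		cfg.dev_no = 2;
    		rc = hr_create(hr, &cfg);
    	}

    	hr_sess_destroy(hr);
    	return rc;
    }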
diff --git a/uspace/app/hrctl/meson.build b/uspace/app/hrctl/meson.build
new file mode 100644
index 0000000000..5185436c12
--- /dev/null
+++ b/uspace/app/hrctl/meson.build
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2025 Miroslav Cimerman
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# - Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# - The name of the author may not be used to endorse or promote products
+#   derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+deps = [ 'device', 'sif' ]
+src = files('hrctl.c')
+
+installed_data += { 'name': 'sample_hrconfig_pci.sif', 'dir': '/cfg' }
+installed_data += { 'name': 'sample_hrconfig_file_bd.sif', 'dir': '/cfg' }
+installed_data += { 'name': 'create_file_bd_disks.bdsh', 'dir': '/cfg' }
diff --git a/uspace/app/hrctl/sample_hrconfig_file_bd.sif b/uspace/app/hrctl/sample_hrconfig_file_bd.sif
new file mode 100644
index 0000000000..c8c898a597
--- /dev/null
+++ b/uspace/app/hrctl/sample_hrconfig_file_bd.sif
@@ -0,0 +1,9 @@
+<sif>
+<hrconfig>
+<volume devname="hr0" level="5">
+<extent devname="disk1" />
+<extent devname="disk2" />
+<extent devname="disk3" />
+</volume>
+</hrconfig>
+</sif>
diff --git a/uspace/app/hrctl/sample_hrconfig_pci.sif b/uspace/app/hrctl/sample_hrconfig_pci.sif
new file mode 100644
index 0000000000..74388e0285
--- /dev/null
+++ b/uspace/app/hrctl/sample_hrconfig_pci.sif
@@ -0,0 +1,9 @@
+<sif>
+<hrconfig>
+<volume devname="hr0" level="5">
+<extent devname="devices/\hw\pci0\00:01.0\ata-c1\d0" />
+<extent devname="devices/\hw\pci0\00:01.0\ata-c1\d1" />
+<extent devname="devices/\hw\pci0\00:01.0\ata-c2\d0" />
+</volume>
+</hrconfig>
+</sif>
diff --git a/uspace/app/meson.build b/uspace/app/meson.build
index 7505a71473..83bd1e8f41 100644
--- a/uspace/app/meson.build
+++ b/uspace/app/meson.build
@@ -30,6 +30,7 @@ apps = [
 	'aboutos',
 	'barber',
 	'bdsh',
+	'bdwrite',
 	'bithenge',
 	'blkdump',
 	'calculator',
@@ -51,6 +52,7 @@ apps = [
 	'gunzip',
 	'hbench',
 	'hello',
+	'hrctl',
 	'inet',
 	'init',
 	'kill',
diff --git a/uspace/lib/c/include/ipc/services.h b/uspace/lib/c/include/ipc/services.h
index 34ed1531f1..fcb1ee222d 100644
--- a/uspace/lib/c/include/ipc/services.h
+++ b/uspace/lib/c/include/ipc/services.h
@@ -57,6 +57,7 @@ typedef enum {
 #define SERVICE_NAME_DISPCFG "hid/display"
 #define SERVICE_NAME_DISPLAY "hid/display"
 #define SERVICE_NAME_WNDMGT "hid/display"
+#define SERVICE_NAME_HR "hr"
 #define SERVICE_NAME_DHCP "net/dhcp"
 #define SERVICE_NAME_DNSR "net/dnsr"
 #define SERVICE_NAME_INET "net/inet"
diff --git a/uspace/lib/device/include/hr.h b/uspace/lib/device/include/hr.h
new file mode 100644
index 0000000000..3bc8bb9389
--- /dev/null
+++ b/uspace/lib/device/include/hr.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2025 Miroslav Cimerman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup libdevice
+ * @{
+ */
+/**
+ * @file
+ */
+
+#ifndef LIBDEVICE_HR_H
+#define LIBDEVICE_HR_H
+
+#include <async.h>
+#include <loc.h>
+#include <stdint.h>
+
+/* for now */
+#define HR_MAX_EXTENTS 4
+#define HR_MAX_HOTSPARES HR_MAX_EXTENTS
+
+#define HR_DEVNAME_LEN 32
+
+typedef enum hr_level {
+	HR_LVL_0 = 0x00,	/* striping, no redundancy */
+	HR_LVL_1 = 0x01,	/* n-way mirroring */
+	HR_LVL_4 = 0x04,	/* dedicated parity */
+	HR_LVL_5 = 0x05,	/* distributed parity */
+	HR_LVL_UNKNOWN = 0xFF
+} hr_level_t;
+
+typedef enum hr_layout {
+	HR_LAYOUT_NONE = 0,
+	HR_LAYOUT_RAID4_0,	/* RAID-4 Non-Rotating Parity 0 */
+	HR_LAYOUT_RAID4_N,	/* RAID-4 Non-Rotating Parity N */
+	HR_LAYOUT_RAID5_0R,	/* RAID-5 Rotating Parity 0 with Data Restart */
+	HR_LAYOUT_RAID5_NR,	/* RAID-5 Rotating Parity N with Data Restart */
+	HR_LAYOUT_RAID5_NC	/* RAID-5 Rotating Parity N with Data Continuation */
+} hr_layout_t;
+
+typedef enum hr_vol_state {
+	HR_VOL_NONE = 0,	/* Unknown/None */
+	HR_VOL_OPTIMAL,		/* optimal */
+	HR_VOL_FAULTY,		/* unusable */
+	HR_VOL_DEGRADED,	/* not optimal */
+	HR_VOL_REBUILD		/* rebuild in progress */
+} hr_vol_state_t;
+
+typedef enum hr_ext_state {
+	HR_EXT_NONE = 0,	/* unknown/none state */
+	HR_EXT_INVALID,		/* working but not consistent */
+	HR_EXT_ONLINE,		/* ok */
+	HR_EXT_MISSING,		/* offline */
+	HR_EXT_FAILED,
+	HR_EXT_REBUILD,
+	HR_EXT_HOTSPARE
+} hr_ext_state_t;
+
+typedef enum {
+	HR_METADATA_NATIVE = 0,
+	HR_METADATA_GEOM_MIRROR,
+	HR_METADATA_GEOM_STRIPE,
+	HR_METADATA_SOFTRAID,
+	HR_METADATA_MD,
+	HR_METADATA_NOOP,
+	HR_METADATA_LAST_PLACEHOLDER
+} hr_metadata_type_t;
+
+#define HR_VOL_FLAG_COUNT 2
+typedef enum hr_vol_flag {
+	HR_VOL_FLAG_NOOP_META = 0x01,
+	HR_VOL_FLAG_READ_ONLY = 0x02
+} hr_vol_flag_t;
+
+typedef struct hr {
+	async_sess_t *sess;
+} hr_t;
+
+typedef struct hr_config {
+	char devname[HR_DEVNAME_LEN];
+	service_id_t devs[HR_MAX_EXTENTS];
+	size_t dev_no;
+	hr_level_t level;
+	uint8_t vol_flags;
+} hr_config_t;
+
+typedef struct hr_extent {
+	service_id_t svc_id;
+	hr_ext_state_t state;
+} hr_extent_t;
+
+typedef struct hr_pair_vol_state {
+	service_id_t svc_id;
+	hr_vol_state_t state;
+} hr_pair_vol_state_t;
+
+typedef struct hr_vol_info {
+	char devname[HR_DEVNAME_LEN];
+	service_id_t svc_id;
+	hr_level_t level;
+	hr_extent_t extents[HR_MAX_EXTENTS];
+	hr_extent_t hotspares[HR_MAX_HOTSPARES];
+	size_t extent_no;
+	size_t hotspare_no;
+	uint64_t data_blkno;
+	uint64_t rebuild_blk;
+	uint32_t strip_size;
+	size_t bsize;
+	hr_vol_state_t state;
+	hr_layout_t layout;
+	hr_metadata_type_t meta_type;
+	uint8_t vflags;
+} hr_vol_info_t;
+
+extern errno_t hr_sess_init(hr_t **);
+extern void hr_sess_destroy(hr_t *);
+extern errno_t hr_create(hr_t *, hr_config_t *);
+extern errno_t hr_assemble(hr_t *, hr_config_t *, size_t *);
+extern errno_t hr_auto_assemble(hr_t *, size_t *);
+extern errno_t hr_stop(hr_t *, const char *);
+extern errno_t hr_stop_all(hr_t *);
+extern errno_t hr_fail_extent(hr_t *, const char *, unsigned long);
+extern errno_t hr_add_hotspare(hr_t *, const char *, const char *);
+extern errno_t hr_get_vol_states(hr_t *, hr_pair_vol_state_t **, size_t *);
+extern errno_t hr_get_vol_info(hr_t *, service_id_t, hr_vol_info_t *);
+extern const char *hr_get_vol_state_str(hr_vol_state_t);
+extern const char *hr_get_ext_state_str(hr_ext_state_t);
+extern const char *hr_get_layout_str(hr_layout_t);
+extern const char *hr_get_level_str(hr_level_t);
+extern const char *hr_get_metadata_type_str(hr_metadata_type_t);
+extern const char *hr_get_vol_flag_str(hr_vol_flag_t);
+
+#endif
+
+/** @}
+ */
diff --git a/uspace/lib/device/include/ipc/hr.h b/uspace/lib/device/include/ipc/hr.h
new file mode 100644
index 0000000000..b7d659dc24
--- /dev/null
+++ b/uspace/lib/device/include/ipc/hr.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2025 Miroslav Cimerman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup libdevice
+ * @{
+ */
+/** @file
+ */
+
+#ifndef LIBDEVICE_IPC_HR_H
+#define LIBDEVICE_IPC_HR_H
+
+#include <ipc/common.h>
+
+typedef enum {
+	HR_CREATE = IPC_FIRST_USER_METHOD,
+	HR_ASSEMBLE,
+	HR_AUTO_ASSEMBLE,
+	HR_STOP,
+	HR_STOP_ALL,
+	HR_FAIL_EXTENT,
+	HR_ADD_HOTSPARE,
+	HR_GET_VOL_STATES,
+	HR_GET_VOL_INFO
+} hr_request_t;
+
+#endif
+
+/** @}
+ */
diff --git a/uspace/lib/device/meson.build b/uspace/lib/device/meson.build
index ed7659953c..e6ee1a7e33 100644
--- a/uspace/lib/device/meson.build
+++ b/uspace/lib/device/meson.build
@@ -31,6 +31,7 @@ src = files(
 	'src/bd_srv.c',
 	'src/devman.c',
 	'src/device/led_dev.c',
+	'src/hr.c',
 	'src/io/chardev.c',
 	'src/io/chardev_srv.c',
 	'src/io/label.c',
diff --git a/uspace/lib/device/src/hr.c b/uspace/lib/device/src/hr.c
new file mode 100644
index 0000000000..f770e855ee
--- /dev/null
+++ b/uspace/lib/device/src/hr.c
@@ -0,0 +1,576 @@
+/*
+ * Copyright (c) 2025 Miroslav Cimerman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup libdevice
+ * @{
+ */
+/**
+ * @file HelenRAID client API
+ */
+
+#include <abi/ipc/interfaces.h>
+#include <async.h>
+#include <errno.h>
+#include <hr.h>
+#include <ipc/hr.h>
+#include <ipc/services.h>
+#include <loc.h>
+#include <stdlib.h>
+
+/** Initialize server session.
+ *
+ * @param rhr Place to store pointer to the initialized session
+ *
+ * @return EOK on success or an error code
+ */
+errno_t hr_sess_init(hr_t **rhr)
+{
+	errno_t rc;
+	hr_t *hr = NULL;
+
+	if (rhr == NULL)
+		return EINVAL;
+
+	hr = calloc(1, sizeof(hr_t));
+	if (hr == NULL) {
+		rc = ENOMEM;
+		goto error;
+	}
+
+	service_id_t hr_svcid;
+
+	rc = loc_service_get_id(SERVICE_NAME_HR, &hr_svcid, 0);
+	if (rc != EOK)
+		goto error;
+
+	hr->sess = loc_service_connect(hr_svcid, INTERFACE_HR, 0);
+	if (hr->sess == NULL) {
+		rc = EIO;
+		goto error;
+	}
+
+	*rhr = hr;
+	return EOK;
+error:
+	if (hr != NULL)
+		free(hr);
+
+	return rc;
+}
+
+/** Destroy server session.
+ *
+ * @param hr Session to destroy
+ */
+void hr_sess_destroy(hr_t *hr)
+{
+	if (hr == NULL)
+		return;
+
+	async_hangup(hr->sess);
+	free(hr);
+}
+
+/** Create volume.
+ * + * @param hr Server session + * @param hr_config Config to create from + * + * @return EOK on success or an error code + */ +errno_t hr_create(hr_t *hr, hr_config_t *hr_config) +{ + errno_t rc, retval; + async_exch_t *exch; + aid_t req; + + exch = async_exchange_begin(hr->sess); + if (exch == NULL) + return EINVAL; + + req = async_send_0(exch, HR_CREATE, NULL); + + rc = async_data_write_start(exch, hr_config, sizeof(hr_config_t)); + if (rc != EOK) { + async_exchange_end(exch); + async_forget(req); + return rc; + } + + async_exchange_end(exch); + async_wait_for(req, &retval); + return retval; +} + +/** Assemble volumes. + * + * @param hr Server session + * @param hr_config Config to assemble from + * @param rassembled_cnt Place to store assembled count + * + * @return EOK on success or an error code + */ +errno_t hr_assemble(hr_t *hr, hr_config_t *hr_config, size_t *rassembled_cnt) +{ + errno_t rc; + async_exch_t *exch; + aid_t req; + size_t assembled_cnt; + + exch = async_exchange_begin(hr->sess); + if (exch == NULL) + return EINVAL; + + req = async_send_0(exch, HR_ASSEMBLE, NULL); + + rc = async_data_write_start(exch, hr_config, sizeof(hr_config_t)); + if (rc != EOK) { + async_exchange_end(exch); + async_forget(req); + return rc; + } + + rc = async_data_read_start(exch, &assembled_cnt, sizeof(size_t)); + if (rc != EOK) { + async_exchange_end(exch); + async_forget(req); + return rc; + } + + async_exchange_end(exch); + async_wait_for(req, &rc); + + if (rassembled_cnt != NULL) + *rassembled_cnt = assembled_cnt; + + return rc; +} + +/** Automatically assemble volumes. + * + * @param hr Server session + * @param rassembled_cnt Place to store assembled count + * + * @return EOK on success or an error code + */ +errno_t hr_auto_assemble(hr_t *hr, size_t *rassembled_cnt) +{ + errno_t rc; + size_t assembled_cnt; + + async_exch_t *exch = async_exchange_begin(hr->sess); + if (exch == NULL) { + rc = EINVAL; + goto error; + } + + aid_t req = async_send_0(exch, HR_AUTO_ASSEMBLE, NULL); + + rc = async_data_read_start(exch, &assembled_cnt, sizeof(size_t)); + if (rc != EOK) { + async_exchange_end(exch); + async_forget(req); + return rc; + } + + async_exchange_end(exch); + async_wait_for(req, &rc); + + if (rassembled_cnt != NULL) + *rassembled_cnt = assembled_cnt; +error: + return rc; +} + +/** Stop/deactivate volume. + * + * @param hr Server session + * @param devname Volume name + * + * @return EOK on success or an error code + */ +errno_t hr_stop(hr_t *hr, const char *devname) +{ + errno_t rc; + async_exch_t *exch; + service_id_t svc_id; + + rc = loc_service_get_id(devname, &svc_id, 0); + if (rc != EOK) + return rc; + + exch = async_exchange_begin(hr->sess); + if (exch == NULL) { + rc = EINVAL; + goto error; + } + + rc = async_req_1_0(exch, HR_STOP, svc_id); + async_exchange_end(exch); +error: + return rc; +} + +/** Stop/deactivate all volumes. + * + * @param hr Server session + * + * @return EOK on success or an error code + */ +errno_t hr_stop_all(hr_t *hr) +{ + async_exch_t *exch; + errno_t rc; + + exch = async_exchange_begin(hr->sess); + if (exch == NULL) { + rc = EINVAL; + goto error; + } + + rc = async_req_0_0(exch, HR_STOP_ALL); + async_exchange_end(exch); +error: + return rc; +} + +/** Fail an extent in volume. 
+ * + * @param hr Server session + * @param volume_name Volume name + * @param extent Extent index to fail + * + * @return EOK on success or an error code + */ +errno_t hr_fail_extent(hr_t *hr, const char *volume_name, unsigned long extent) +{ + errno_t rc; + async_exch_t *exch; + service_id_t vol_svc_id; + + rc = loc_service_get_id(volume_name, &vol_svc_id, 0); + if (rc != EOK) + return rc; + + exch = async_exchange_begin(hr->sess); + if (exch == NULL) { + rc = EINVAL; + goto error; + } + + rc = async_req_2_0(exch, HR_FAIL_EXTENT, vol_svc_id, extent); + async_exchange_end(exch); +error: + return rc; +} + +/** Add a hotspare to volume. + * + * @param hr Server session + * @param volume_name Volume name + * @param hotspare Hotspare service name + * + * @return EOK on success or an error code + */ +errno_t hr_add_hotspare(hr_t *hr, const char *volume_name, const char *hotspare) +{ + errno_t rc; + async_exch_t *exch; + service_id_t vol_svc_id, hs_svc_id; + + rc = loc_service_get_id(volume_name, &vol_svc_id, 0); + if (rc != EOK) + return rc; + + rc = loc_service_get_id(hotspare, &hs_svc_id, 0); + if (rc != EOK) + return rc; + + exch = async_exchange_begin(hr->sess); + if (exch == NULL) { + rc = EINVAL; + goto error; + } + + rc = async_req_2_0(exch, HR_ADD_HOTSPARE, vol_svc_id, hs_svc_id); + async_exchange_end(exch); +error: + return rc; +} + +/** Get state of volumes. + * + * @param hr Server session + * @param rpairs Place to store pointer to (service id, vol state) pairs + * @param rcnt Place to store pair count + * + * @return EOK on success or an error code + */ +errno_t hr_get_vol_states(hr_t *hr, hr_pair_vol_state_t **rpairs, size_t *rcnt) +{ + errno_t rc, retval; + async_exch_t *exch; + aid_t req; + size_t cnt, i; + hr_pair_vol_state_t *pairs = NULL; + + exch = async_exchange_begin(hr->sess); + if (exch == NULL) { + rc = EINVAL; + goto error; + } + + req = async_send_0(exch, HR_GET_VOL_STATES, NULL); + rc = async_data_read_start(exch, &cnt, sizeof(size_t)); + if (rc != EOK) { + async_exchange_end(exch); + async_forget(req); + return rc; + } + + pairs = calloc(cnt, sizeof(*pairs)); + if (pairs == NULL) { + async_exchange_end(exch); + async_forget(req); + return ENOMEM; + } + + for (i = 0; i < cnt; i++) { + rc = async_data_read_start(exch, &pairs[i], sizeof(*pairs)); + if (rc != EOK) { + async_exchange_end(exch); + async_forget(req); + goto error; + } + } + + async_exchange_end(exch); + async_wait_for(req, &retval); + if (retval != EOK) { + rc = retval; + goto error; + } + + if (rpairs != NULL) + *rpairs = pairs; + if (rcnt != NULL) + *rcnt = cnt; + return EOK; + +error: + if (pairs != NULL) + free(pairs); + return rc; +} + +/** Get volume info. 
+ *
+ * @param hr Server session
+ * @param svc_id Service id of volume
+ * @param rinfo Place to store volume info
+ *
+ * @return EOK on success or an error code
+ */
+errno_t hr_get_vol_info(hr_t *hr, service_id_t svc_id, hr_vol_info_t *rinfo)
+{
+	errno_t rc, retval;
+	async_exch_t *exch;
+	aid_t req;
+
+	exch = async_exchange_begin(hr->sess);
+	if (exch == NULL) {
+		rc = EINVAL;
+		goto error;
+	}
+
+	req = async_send_0(exch, HR_GET_VOL_INFO, NULL);
+	rc = async_data_write_start(exch, &svc_id, sizeof(svc_id));
+	if (rc != EOK) {
+		async_exchange_end(exch);
+		async_forget(req);
+		return rc;
+	}
+
+	rc = async_data_read_start(exch, rinfo, sizeof(*rinfo));
+	async_exchange_end(exch);
+	if (rc != EOK) {
+		async_forget(req);
+		goto error;
+	}
+
+	async_wait_for(req, &retval);
+	if (retval != EOK) {
+		rc = retval;
+		goto error;
+	}
+
+error:
+	return rc;
+}
+
+/** Get volume state string.
+ *
+ * @param state State value
+ *
+ * @return State string
+ */
+const char *hr_get_vol_state_str(hr_vol_state_t state)
+{
+	switch (state) {
+	case HR_VOL_NONE:
+		return "NONE/UNKNOWN";
+	case HR_VOL_OPTIMAL:
+		return "OPTIMAL";
+	case HR_VOL_FAULTY:
+		return "FAULTY";
+	case HR_VOL_DEGRADED:
+		return "DEGRADED";
+	case HR_VOL_REBUILD:
+		return "REBUILD";
+	default:
+		return "Invalid state value";
+	}
+}
+
+/** Get extent state string.
+ *
+ * @param state State value
+ *
+ * @return State string
+ */
+const char *hr_get_ext_state_str(hr_ext_state_t state)
+{
+	switch (state) {
+	case HR_EXT_NONE:
+		return "NONE/UNKNOWN";
+	case HR_EXT_INVALID:
+		return "INVALID";
+	case HR_EXT_ONLINE:
+		return "ONLINE";
+	case HR_EXT_MISSING:
+		return "MISSING";
+	case HR_EXT_FAILED:
+		return "FAILED";
+	case HR_EXT_REBUILD:
+		return "REBUILD";
+	case HR_EXT_HOTSPARE:
+		return "HOTSPARE";
+	default:
+		return "Invalid state value";
+	}
+}
+
+/** Get volume layout string.
+ *
+ * @param layout Layout value
+ *
+ * @return Layout string
+ */
+const char *hr_get_layout_str(hr_layout_t layout)
+{
+	switch (layout) {
+	case HR_LAYOUT_NONE:
+		return "RAID layout not set";
+	case HR_LAYOUT_RAID4_0:
+		return "RAID-4 Non-Rotating Parity 0";
+	case HR_LAYOUT_RAID4_N:
+		return "RAID-4 Non-Rotating Parity N";
+	case HR_LAYOUT_RAID5_0R:
+		return "RAID-5 Rotating Parity 0 with Data Restart";
+	case HR_LAYOUT_RAID5_NR:
+		return "RAID-5 Rotating Parity N with Data Restart";
+	case HR_LAYOUT_RAID5_NC:
+		return "RAID-5 Rotating Parity N with Data Continuation";
+	default:
+		return "Invalid RAID layout";
+	}
+}
+
+/** Get volume level string.
+ *
+ * @param level Level value
+ *
+ * @return Level string
+ */
+const char *hr_get_level_str(hr_level_t level)
+{
+	switch (level) {
+	case HR_LVL_0:
+		return "stripe (RAID 0)";
+	case HR_LVL_1:
+		return "mirror (RAID 1)";
+	case HR_LVL_4:
+		return "dedicated parity (RAID 4)";
+	case HR_LVL_5:
+		return "distributed parity (RAID 5)";
+	default:
+		return "Invalid RAID level";
+	}
+}
+
+/** Get volume metadata type string.
+ *
+ * @param type Metadata type value
+ *
+ * @return Metadata type string
+ */
+const char *hr_get_metadata_type_str(hr_metadata_type_t type)
+{
+	switch (type) {
+	case HR_METADATA_NATIVE:
+		return "HelenRAID native";
+	case HR_METADATA_GEOM_MIRROR:
+		return "GEOM::MIRROR";
+	case HR_METADATA_GEOM_STRIPE:
+		return "GEOM::STRIPE";
+	case HR_METADATA_SOFTRAID:
+		return "OpenBSD softraid";
+	case HR_METADATA_MD:
+		return "Linux Multiple Device";
+	case HR_METADATA_NOOP:
+		return "NOOP Metadata";
+	default:
+		return "Invalid metadata type value";
+	}
+}
+
+/** Get volume flag string.
+ *
+ * @param flag Flag value
+ *
+ * @return Flag string
+ */
+const char *hr_get_vol_flag_str(hr_vol_flag_t flag)
+{
+	switch (flag) {
+	case HR_VOL_FLAG_NOOP_META:
+		return "--no-meta";
+	case HR_VOL_FLAG_READ_ONLY:
+		return "--read-only";
+	default:
+		return "Invalid flag";
+	}
+}
+
+/** @}
+ */
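The server answering these methods lives in uspace/srv/bd/hr, of which only fge.c appears below. As a hedged sketch of the other end of the HR_CREATE exchange implemented by hr_create() above: the handler name and its surrounding connection loop are hypothetical, and only the async calls mirror the client protocol.

    #include <async.h>
    #include <errno.h>
    #include <hr.h>
    #include <stdlib.h>

    /* Hypothetical server-side counterpart of hr_create(). */
    static void hr_srv_create(ipc_call_t *icall)
    {
    	void *data;
    	size_t size;
    	errno_t rc;

    	/* Accept the data_write carrying one hr_config_t. */
    	rc = async_data_write_accept(&data, false, sizeof(hr_config_t),
    	    sizeof(hr_config_t), 0, &size);
    	if (rc != EOK) {
    		async_answer_0(icall, rc);
    		return;
    	}

    	hr_config_t *cfg = data;
    	/* ... validate cfg and set up the volume here ... */
    	(void)cfg;

    	free(data);
    	/* This answer becomes hr_create()'s return value. */
    	async_answer_0(icall, EOK);
    }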
diff --git a/uspace/srv/bd/hr/fge.c b/uspace/srv/bd/hr/fge.c
new file mode 100644
index 0000000000..f2a8fac044
--- /dev/null
+++ b/uspace/srv/bd/hr/fge.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2025 Miroslav Cimerman
+ * Copyright (c) 2024 Vojtech Horky
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup hr
+ * @{
+ */
+/**
+ * @file
+ * @brief Fibril group executor
+ *
+ * Fibril pool with pre-allocated storage allowing
+ * execution of groups consisting of multiple work
+ * units.
+ */
+
+#include <adt/bitmap.h>
+#include <adt/circ_buf.h>
+#include <assert.h>
+#include <errno.h>
+#include <fibril.h>
+#include <fibril_synch.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "fge.h"
+#include "util.h"
+
+static void *hr_fpool_make_storage(hr_fpool_t *, ssize_t *);
+static void hr_fpool_group_epilogue(hr_fpool_t *);
+static errno_t fge_fibril(void *);
+static errno_t wu_queue_init(wu_queue_t *, size_t);
+static void wu_queue_push(wu_queue_t *, fge_fibril_data_t *);
+static void wu_queue_pop(wu_queue_t *, fge_fibril_data_t *);
+static ssize_t hr_fpool_get_free_slot(hr_fpool_t *);
+
+hr_fpool_t *hr_fpool_create(size_t fibril_cnt, size_t max_wus,
+    size_t wu_storage_size)
+{
+	assert(max_wus > 0 && wu_storage_size > 0);
+
+	void *bitmap_data = NULL;
+
+	hr_fpool_t *result = calloc(1, sizeof(hr_fpool_t));
+	if (result == NULL)
+		return NULL;
+
+	result->fibrils = malloc(sizeof(fid_t) * fibril_cnt);
+	if (result->fibrils == NULL)
+		goto bad;
+
+	result->wu_storage = malloc(wu_storage_size * max_wus);
+	if (result->wu_storage == NULL)
+		goto bad;
+
+	bitmap_data = calloc(1, bitmap_size(max_wus));
+	if (bitmap_data == NULL)
+		goto bad;
+	bitmap_initialize(&result->bitmap, max_wus, bitmap_data);
+
+	if (wu_queue_init(&result->queue, max_wus) != EOK)
+		goto bad;
+
+	fibril_mutex_initialize(&result->lock);
+	fibril_condvar_initialize(&result->all_wus_done);
+
+	result->max_wus = max_wus;
+	result->fibril_cnt = fibril_cnt;
+	result->wu_size = wu_storage_size;
+	result->wu_storage_free_count = max_wus;
+	result->stop = false;
+	result->active_groups = 0;
+
+	for (size_t i = 0; i < fibril_cnt; i++) {
+		result->fibrils[i] = fibril_create(fge_fibril, result);
+		fibril_start(result->fibrils[i]);
+		/* fibril_detach(result->fibrils[i]); */
+	}
+
+	return result;
+bad:
+	if (result->queue.fexecs != NULL)
+		free(result->queue.fexecs);
+	if (bitmap_data != NULL)
+		free(bitmap_data);
+	if (result->wu_storage != NULL)
+		free(result->wu_storage);
+	if (result->fibrils != NULL)
+		free(result->fibrils);
+	free(result);
+
+	return NULL;
+}
+
+void hr_fpool_destroy(hr_fpool_t *pool)
+{
+	fibril_mutex_lock(&pool->lock);
+	pool->stop = true;
+	while (pool->active_groups > 0)
+		fibril_condvar_wait(&pool->all_wus_done, &pool->lock);
+
+	fibril_mutex_unlock(&pool->lock);
+
+	free(pool->bitmap.bits);
+	free(pool->queue.fexecs);
+	free(pool->wu_storage);
+	free(pool->fibrils);
+	free(pool);
+}
+
+hr_fgroup_t *hr_fgroup_create(hr_fpool_t *parent, size_t wu_cnt)
+{
+	assert(wu_cnt > 0);
+
+	hr_fgroup_t *result = hr_malloc_waitok(sizeof(hr_fgroup_t));
+
+	result->reserved_cnt = 0;
+	result->own_mem = NULL;
+	result->memslots = NULL;
+
+	fibril_mutex_lock(&parent->lock);
+
+	parent->active_groups++;
+
+	if (parent->wu_storage_free_count >= wu_cnt) {
+		parent->wu_storage_free_count -= wu_cnt;
+		result->reserved_cnt = wu_cnt;
+	} else {
+		/*
+		 * Could be more conservative with memory here and
+		 * allocate space only for one work unit and execute
+		 * work units sequentially like it was first intended with
+		 * the fallback storage.
+ */ + size_t taking = parent->wu_storage_free_count; + result->own_mem = + hr_malloc_waitok(parent->wu_size * (wu_cnt - taking)); + result->reserved_cnt = taking; + parent->wu_storage_free_count = 0; + } + + if (result->reserved_cnt > 0) { + result->memslots = + hr_malloc_waitok(sizeof(size_t) * result->reserved_cnt); + } + + fibril_mutex_unlock(&parent->lock); + + result->pool = parent; + result->wu_cnt = wu_cnt; + result->submitted = 0; + result->reserved_avail = result->reserved_cnt; + result->own_used = 0; + result->final_errno = EOK; + result->finished_okay = 0; + result->finished_fail = 0; + + fibril_mutex_initialize(&result->lock); + fibril_condvar_initialize(&result->all_done); + + return result; +} + +void *hr_fgroup_alloc(hr_fgroup_t *group) +{ + void *storage; + + fibril_mutex_lock(&group->lock); + + assert(group->submitted < group->wu_cnt); + + if (group->reserved_avail > 0) { + ssize_t memslot; + storage = hr_fpool_make_storage(group->pool, &memslot); + assert(storage != NULL); + group->reserved_avail--; + group->memslots[group->submitted] = memslot; + } else { + assert(group->own_mem != NULL); + storage = + group->own_mem + group->pool->wu_size * group->own_used; + group->own_used++; + } + + fibril_mutex_unlock(&group->lock); + + return storage; +} + +void hr_fgroup_submit(hr_fgroup_t *group, hr_wu_t wu, void *arg) +{ + fibril_mutex_lock(&group->lock); + assert(group->submitted < group->wu_cnt); + + fge_fibril_data_t executor; + executor.wu = wu; + executor.arg = arg; + executor.group = group; + + if (group->submitted < group->reserved_cnt) + executor.memslot = group->memslots[group->submitted]; + else + executor.memslot = -1; + + group->submitted++; + fibril_mutex_unlock(&group->lock); + + wu_queue_push(&group->pool->queue, &executor); +} + +errno_t hr_fgroup_wait(hr_fgroup_t *group, size_t *rokay, size_t *rfailed) +{ + fibril_mutex_lock(&group->lock); + assert(group->submitted <= group->wu_cnt); + + while (true) { + size_t finished = group->finished_fail + group->finished_okay; + if (finished == group->submitted) + break; + + fibril_condvar_wait(&group->all_done, &group->lock); + } + + if (rokay) + *rokay = group->finished_okay; + if (rfailed) + *rfailed = group->finished_fail; + + errno_t rc = group->final_errno; + + fibril_mutex_unlock(&group->lock); + + hr_fpool_group_epilogue(group->pool); + + if (group->memslots != NULL) + free(group->memslots); + if (group->own_mem != NULL) + free(group->own_mem); + free(group); + + return rc; +} + +static void *hr_fpool_make_storage(hr_fpool_t *pool, ssize_t *rmemslot) +{ + fibril_mutex_lock(&pool->lock); + ssize_t memslot = hr_fpool_get_free_slot(pool); + assert(memslot != -1); + + bitmap_set(&pool->bitmap, memslot, 1); + + fibril_mutex_unlock(&pool->lock); + + if (rmemslot) + *rmemslot = memslot; + + return pool->wu_storage + pool->wu_size * memslot; +} + +static void hr_fpool_group_epilogue(hr_fpool_t *pool) +{ + fibril_mutex_lock(&pool->lock); + + pool->active_groups--; + if (pool->active_groups == 0) + fibril_condvar_signal(&pool->all_wus_done); + + fibril_mutex_unlock(&pool->lock); +} + +static errno_t fge_fibril(void *arg) +{ + hr_fpool_t *pool = arg; + while (true) { + fge_fibril_data_t executor; + fibril_mutex_lock(&pool->lock); + + while (circ_buf_nused(&pool->queue.cbuf) == 0 && !pool->stop) { + fibril_condvar_wait(&pool->queue.not_empty, + &pool->lock); + } + + if (pool->stop && circ_buf_nused(&pool->queue.cbuf) == 0) { + fibril_mutex_unlock(&pool->lock); + break; + } + + wu_queue_pop(&pool->queue, &executor); + + 
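+		/*
+		 * The work unit descriptor has been copied out of the
+		 * circular buffer while still holding the pool lock;
+		 * the lock is dropped below so that the work unit
+		 * itself runs unlocked and other workers can keep
+		 * popping queued work.
+		 */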
fibril_mutex_unlock(&pool->lock); + + hr_fgroup_t *group = executor.group; + + errno_t rc = executor.wu(executor.arg); + + if (rc == EOK) { + fibril_mutex_lock(&group->lock); + group->finished_okay++; + fibril_mutex_unlock(&group->lock); + } else { + fibril_mutex_lock(&group->lock); + group->finished_fail++; + if (rc == EAGAIN) + group->final_errno = EAGAIN; + fibril_mutex_unlock(&group->lock); + } + + fibril_mutex_lock(&pool->lock); + if (executor.memslot > -1) { + bitmap_set(&pool->bitmap, executor.memslot, 0); + pool->wu_storage_free_count++; + } + + fibril_mutex_lock(&group->lock); + size_t finished = group->finished_fail + group->finished_okay; + if (finished == group->submitted) + fibril_condvar_signal(&group->all_done); + fibril_mutex_unlock(&group->lock); + + fibril_mutex_unlock(&pool->lock); + } + return EOK; +} + +static errno_t wu_queue_init(wu_queue_t *queue, size_t nmemb) +{ + queue->fexecs = malloc(sizeof(fge_fibril_data_t) * nmemb); + if (queue->fexecs == NULL) + return ENOMEM; + + circ_buf_init(&queue->cbuf, queue->fexecs, nmemb, + sizeof(fge_fibril_data_t)); + + fibril_mutex_initialize(&queue->lock); + fibril_condvar_initialize(&queue->not_empty); + fibril_condvar_initialize(&queue->not_full); + + return EOK; +} + +static void wu_queue_push(wu_queue_t *queue, fge_fibril_data_t *executor) +{ + fibril_mutex_lock(&queue->lock); + + while (circ_buf_push(&queue->cbuf, executor) == EAGAIN) + fibril_condvar_wait(&queue->not_full, &queue->lock); + + fibril_condvar_signal(&queue->not_empty); + + fibril_mutex_unlock(&queue->lock); +} + +static void wu_queue_pop(wu_queue_t *queue, fge_fibril_data_t *executor) +{ + fibril_mutex_lock(&queue->lock); + + while (circ_buf_pop(&queue->cbuf, executor) == EAGAIN) + fibril_condvar_wait(&queue->not_empty, &queue->lock); + + fibril_condvar_signal(&queue->not_full); + + fibril_mutex_unlock(&queue->lock); +} + +static ssize_t hr_fpool_get_free_slot(hr_fpool_t *pool) +{ + bitmap_t *bitmap = &pool->bitmap; + for (size_t i = 0; i < pool->max_wus; i++) + if (!bitmap_get(bitmap, i)) + return i; + return -1; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/fge.h b/uspace/srv/bd/hr/fge.h new file mode 100644 index 0000000000..089f9d8da7 --- /dev/null +++ b/uspace/srv/bd/hr/fge.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * Copyright (c) 2024 Vojtech Horky + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_FGE_H +#define _HR_FGE_H + +#include +#include +#include +#include + +/* forward declarations */ +typedef struct hr_fpool hr_fpool_t; +typedef struct hr_fgroup hr_fgroup_t; +typedef struct fge_fibril_data fge_fibril_data_t; +typedef struct wu_queue wu_queue_t; + +typedef errno_t (*hr_wu_t)(void *); + +struct fge_fibril_data { + hr_wu_t wu; /* work unit function pointer */ + void *arg; /* work unit function argument */ + hr_fgroup_t *group; /* back-pointer to group */ + ssize_t memslot; /* index to pool bitmap slot */ +}; + +struct wu_queue { + fibril_mutex_t lock; + fibril_condvar_t not_empty; + fibril_condvar_t not_full; + fge_fibril_data_t *fexecs; /* circ-buf memory */ + circ_buf_t cbuf; +}; + +struct hr_fpool { + fibril_mutex_t lock; + bitmap_t bitmap; /* memory slot bitmap */ + wu_queue_t queue; + fid_t *fibrils; + uint8_t *wu_storage; /* pre-allocated pool storage */ + size_t fibril_cnt; + size_t max_wus; + size_t active_groups; + bool stop; + size_t wu_size; + size_t wu_storage_free_count; + fibril_condvar_t all_wus_done; +}; + +struct hr_fgroup { + hr_fpool_t *pool;/* back-pointer to pool */ + size_t wu_cnt;/* upper bound of work units */ + size_t submitted; /* number of submitted jobs */ + size_t reserved_cnt; /* no. of reserved wu storage slots */ + size_t reserved_avail; + size_t *memslots; /* indices to pool bitmap */ + void *own_mem; /* own allocated memory */ + size_t own_used; /* own memory slots used counter */ + errno_t final_errno; /* agreggated errno */ + size_t finished_okay; /* no. of wus that ended with EOK */ + size_t finished_fail; /* no. of wus that ended with != EOK */ + fibril_mutex_t lock; + fibril_condvar_t all_done; +}; + +extern hr_fpool_t *hr_fpool_create(size_t, size_t, size_t); +extern void hr_fpool_destroy(hr_fpool_t *); +extern hr_fgroup_t *hr_fgroup_create(hr_fpool_t *, size_t); +extern void *hr_fgroup_alloc(hr_fgroup_t *); +extern void hr_fgroup_submit(hr_fgroup_t *, hr_wu_t, void *); +extern errno_t hr_fgroup_wait(hr_fgroup_t *, size_t *, size_t *); + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/hr.c b/uspace/srv/bd/hr/hr.c new file mode 100644 index 0000000000..f2938d4594 --- /dev/null +++ b/uspace/srv/bd/hr/hr.c @@ -0,0 +1,693 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file hr.c + * @brief HelenRAID server methods. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "var.h" + +static void hr_assemble_srv(ipc_call_t *); +static void hr_auto_assemble_srv(ipc_call_t *); +static void hr_stop_srv(ipc_call_t *); +static void hr_stop_all_srv(ipc_call_t *); +static void hr_add_hotspare_srv(ipc_call_t *); +static void hr_get_vol_states_srv(ipc_call_t *); +static void hr_ctl_conn(ipc_call_t *); +static void hr_call_handler(ipc_call_t *, void *); + +loc_srv_t *hr_srv; +list_t hr_volumes; +fibril_rwlock_t hr_volumes_lock; + +static service_id_t ctl_sid; + +/** Volume creation (server). + * + * Creates HelenRAID volume from parameters and + * devices specified in hr_config_t. + * + * @param icall hr_config_t + */ +static void hr_create_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + size_t i, size; + hr_config_t *cfg; + hr_volume_t *vol; + ipc_call_t call; + + if (!async_data_write_receive(&call, &size)) { + async_answer_0(&call, EREFUSED); + async_answer_0(icall, EREFUSED); + return; + } + + if (size != sizeof(hr_config_t)) { + async_answer_0(&call, EINVAL); + async_answer_0(icall, EINVAL); + return; + } + + cfg = calloc(1, sizeof(hr_config_t)); + if (cfg == NULL) { + async_answer_0(&call, ENOMEM); + async_answer_0(icall, ENOMEM); + return; + } + + rc = async_data_write_finalize(&call, cfg, size); + if (rc != EOK) { + free(cfg); + async_answer_0(&call, rc); + async_answer_0(icall, rc); + return; + } + + if (cfg->dev_no > HR_MAX_EXTENTS) { + HR_ERROR("provided %u devices (max = %u)", + (unsigned)cfg->dev_no, HR_MAX_EXTENTS); + free(cfg); + async_answer_0(icall, ELIMIT); + return; + } + + /* + * If there was a missing device provided + * for creation of a new volume, abort + */ + for (i = 0; i < cfg->dev_no; i++) { + if (cfg->devs[i] == 0) { + HR_ERROR("missing device provided for volume " + "creation, aborting"); + free(cfg); + async_answer_0(icall, EINVAL); + return; + } + } + + hr_metadata_type_t meta_type; + if (cfg->vol_flags & HR_VOL_FLAG_NOOP_META) + meta_type = HR_METADATA_NOOP; + else + meta_type = HR_METADATA_NATIVE; + + rc = hr_create_vol_struct(&vol, cfg->level, cfg->devname, meta_type, + cfg->vol_flags); + if (rc != EOK) { + free(cfg); + async_answer_0(icall, rc); + return; + } + + rc = hr_init_extents_from_cfg(vol, cfg); + if (rc != EOK) + goto error; + + vol->hr_ops.init(vol); + if (rc != EOK) + goto error; + + rc = vol->meta_ops->init_vol2meta(vol); + if (rc != EOK) + goto error; + + rc = vol->hr_ops.create(vol); + if (rc != EOK) + goto error; + + vol->meta_ops->save(vol, WITH_STATE_CALLBACK); + + rc = 
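+	    /* register the new volume before publishing it in hr_volumes */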
hr_register_volume(vol); + if (rc != EOK) + goto error; + + fibril_rwlock_write_lock(&hr_volumes_lock); + list_append(&vol->lvolumes, &hr_volumes); + fibril_rwlock_write_unlock(&hr_volumes_lock); + + HR_NOTE("created volume \"%s\" (%" PRIun ")\n", vol->devname, + vol->svc_id); + + free(cfg); + async_answer_0(icall, rc); + return; +error: + free(cfg); + hr_destroy_vol_struct(vol); + async_answer_0(icall, rc); +} + +/** Manual volume assembly (server). + * + * Tries to assemble a volume from devices in hr_config_t and + * sends the number of successful volumes assembled back to the + * client. + * + * @param icall hr_config_t + */ +static void hr_assemble_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + size_t size, assembled_cnt; + hr_config_t *cfg; + ipc_call_t call; + + if (!async_data_write_receive(&call, &size)) { + async_answer_0(&call, EREFUSED); + async_answer_0(icall, EREFUSED); + return; + } + + if (size != sizeof(hr_config_t)) { + async_answer_0(&call, EINVAL); + async_answer_0(icall, EINVAL); + return; + } + + cfg = calloc(1, sizeof(hr_config_t)); + if (cfg == NULL) { + async_answer_0(&call, ENOMEM); + async_answer_0(icall, ENOMEM); + return; + } + + rc = async_data_write_finalize(&call, cfg, size); + if (rc != EOK) + goto error; + + if (!async_data_read_receive(&call, &size)) { + async_answer_0(icall, EREFUSED); + return; + } + + if (size != sizeof(size_t)) { + async_answer_0(icall, EINVAL); + return; + } + + rc = hr_util_try_assemble(cfg, &assembled_cnt); + if (rc != EOK) + goto error; + + rc = async_data_read_finalize(&call, &assembled_cnt, size); + if (rc != EOK) + goto error; + + free(cfg); + async_answer_0(icall, EOK); + return; +error: + free(cfg); + async_answer_0(&call, rc); + async_answer_0(icall, rc); +} + +/** Automatic volume assembly (server). + * + * Tries to assemble a volume from devices in disk location + * category and sends the number of successful volumes assembled + * back to client. + */ +static void hr_auto_assemble_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + size_t size; + size_t assembled_cnt = 0; + ipc_call_t call; + + if (!async_data_read_receive(&call, &size)) { + async_answer_0(icall, EREFUSED); + return; + } + + if (size != sizeof(size_t)) { + async_answer_0(&call, EINVAL); + async_answer_0(icall, EINVAL); + return; + } + + rc = hr_util_try_assemble(NULL, &assembled_cnt); + if (rc != EOK) + goto error; + + rc = async_data_read_finalize(&call, &assembled_cnt, size); + if (rc != EOK) + goto error; + + async_answer_0(icall, EOK); + return; +error: + async_answer_0(&call, rc); + async_answer_0(icall, rc); +} + +/** Volume deactivation (server). + * + * Deactivates/detaches specified volume. + */ +static void hr_stop_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + service_id_t svc_id; + + svc_id = ipc_get_arg1(icall); + + rc = hr_remove_volume(svc_id); + + async_answer_0(icall, rc); +} + +/** Automatic volume deactivation (server). + * + * Tries to deactivate/detach all volumes. + */ +static void hr_stop_all_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + service_id_t *vol_svcs = NULL; + errno_t rc = EOK; + size_t i, vol_cnt; + + rc = hr_get_volume_svcs(&vol_cnt, &vol_svcs); + if (rc != EOK) + goto fail; + + for (i = 0; i < vol_cnt; i++) { + errno_t rc2 = hr_remove_volume(vol_svcs[i]); + if (rc2 == EBUSY) + rc = EBUSY; + } + +fail: + if (vol_svcs != NULL) + free(vol_svcs); + async_answer_0(icall, rc); +} + +/** Simulate volume extent failure (server). 
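+ * (a fault-injection aid for exercising extent and volume state
+ * handling)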
+ * + * Changes the specified extent's state to FAULTY. + * Other extents' metadata are marked as dirty, therefore + * it effectively invalides the specified extent as well + * for further uses. + */ +static void hr_fail_extent_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + service_id_t svc_id; + size_t extent_idx_to_fail; + hr_volume_t *vol; + + svc_id = (service_id_t)ipc_get_arg1(icall); + extent_idx_to_fail = (size_t)ipc_get_arg2(icall); + + vol = hr_get_volume(svc_id); + if (vol == NULL) { + async_answer_0(icall, ENOENT); + return; + } + + fibril_rwlock_write_lock(&vol->extents_lock); + fibril_rwlock_write_lock(&vol->states_lock); + + hr_extent_t *ext = &vol->extents[extent_idx_to_fail]; + + switch (ext->state) { + case HR_EXT_NONE: + case HR_EXT_MISSING: + case HR_EXT_FAILED: + fibril_rwlock_write_unlock(&vol->states_lock); + fibril_rwlock_write_unlock(&vol->extents_lock); + async_answer_0(icall, EINVAL); + return; + default: + hr_update_ext_state(vol, extent_idx_to_fail, HR_EXT_FAILED); + (void)vol->meta_ops->erase_block(ext->svc_id); + block_fini(ext->svc_id); + ext->svc_id = 0; + hr_mark_vol_state_dirty(vol); + } + + fibril_rwlock_write_unlock(&vol->states_lock); + fibril_rwlock_write_unlock(&vol->extents_lock); + + vol->hr_ops.vol_state_eval(vol); + + async_answer_0(icall, EOK); +} + +/** Add hotspare to volume (server). + * + * Adds hotspare to a volume. + */ +static void hr_add_hotspare_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + service_id_t vol_svc_id; + service_id_t hotspare; + hr_volume_t *vol; + + vol_svc_id = ipc_get_arg1(icall); + hotspare = ipc_get_arg2(icall); + + vol = hr_get_volume(vol_svc_id); + if (vol == NULL) { + async_answer_0(icall, ENOENT); + return; + } + + if (vol->level == HR_LVL_0) { + HR_NOTE("hotspare not supported on RAID level = %s\n", + hr_get_level_str(vol->level)); + async_answer_0(icall, ENOTSUP); + return; + } + + if (!(vol->meta_ops->get_flags() & HR_METADATA_HOTSPARE_SUPPORT)) { + HR_NOTE("hotspare not supported on metadata type = %s\n", + hr_get_metadata_type_str(vol->meta_ops->get_type())); + async_answer_0(icall, ENOTSUP); + return; + } + + rc = hr_util_add_hotspare(vol, hotspare); + + vol->hr_ops.vol_state_eval(vol); + + async_answer_0(icall, rc); +} + +/** Send volume states. + * + * Sends the client pairs of (volume service_id, state). + */ +static void hr_get_vol_states_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + size_t vol_cnt = 0; + hr_pair_vol_state_t pair; + ipc_call_t call; + size_t size; + + fibril_rwlock_read_lock(&hr_volumes_lock); + + vol_cnt = list_count(&hr_volumes); + + if (!async_data_read_receive(&call, &size)) { + rc = EREFUSED; + goto error; + } + + if (size != sizeof(vol_cnt)) { + rc = EINVAL; + goto error; + } + + rc = async_data_read_finalize(&call, &vol_cnt, size); + if (rc != EOK) + goto error; + + list_foreach(hr_volumes, lvolumes, hr_volume_t, vol) { + pair.svc_id = vol->svc_id; + pair.state = vol->state; + + if (!async_data_read_receive(&call, &size)) { + rc = EREFUSED; + goto error; + } + + if (size != sizeof(pair)) { + rc = EINVAL; + goto error; + } + + rc = async_data_read_finalize(&call, &pair, size); + if (rc != EOK) + goto error; + } + + fibril_rwlock_read_unlock(&hr_volumes_lock); + async_answer_0(icall, EOK); + return; +error: + fibril_rwlock_read_unlock(&hr_volumes_lock); + async_answer_0(&call, rc); + async_answer_0(icall, rc); +} + +/** Send volume info. + * + * Sends the client volume info. 
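+ *
+ * The client first writes the requested volume service id
+ * (a service_id_t), then reads back a single hr_vol_info_t.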
+ */ +static void hr_get_vol_info_srv(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + size_t size; + ipc_call_t call; + service_id_t svc_id; + hr_vol_info_t info; + hr_volume_t *vol; + + if (!async_data_write_receive(&call, &size)) { + rc = EREFUSED; + goto error; + } + + if (size != sizeof(service_id_t)) { + rc = EINVAL; + goto error; + } + + rc = async_data_write_finalize(&call, &svc_id, size); + if (rc != EOK) + goto error; + + vol = hr_get_volume(svc_id); + if (vol == NULL) { + rc = ENOENT; + goto error; + } + + memcpy(info.extents, vol->extents, + sizeof(hr_extent_t) * HR_MAX_EXTENTS); + memcpy(info.hotspares, vol->hotspares, + sizeof(hr_extent_t) * HR_MAX_HOTSPARES); + info.svc_id = vol->svc_id; + info.extent_no = vol->extent_no; + info.hotspare_no = vol->hotspare_no; + info.level = vol->level; + info.data_blkno = vol->data_blkno; + info.rebuild_blk = vol->rebuild_blk; + info.strip_size = vol->strip_size; + info.bsize = vol->bsize; + info.state = vol->state; + info.layout = vol->layout; + info.meta_type = vol->meta_ops->get_type(); + memcpy(info.devname, vol->devname, HR_DEVNAME_LEN); + info.vflags = vol->vflags; + + if (!async_data_read_receive(&call, &size)) { + rc = EREFUSED; + goto error; + } + + if (size != sizeof(info)) { + rc = EINVAL; + goto error; + } + + rc = async_data_read_finalize(&call, &info, size); + if (rc != EOK) + goto error; + + async_answer_0(icall, EOK); + return; +error: + async_answer_0(&call, rc); + async_answer_0(icall, rc); +} + +/** HelenRAID server control IPC methods crossroad. + */ +static void hr_ctl_conn(ipc_call_t *icall) +{ + HR_DEBUG("%s()", __func__); + + async_accept_0(icall); + + while (true) { + ipc_call_t call; + async_get_call(&call); + sysarg_t method = ipc_get_imethod(&call); + + if (!method) { + async_answer_0(&call, EOK); + return; + } + + switch (method) { + case HR_CREATE: + hr_create_srv(&call); + break; + case HR_ASSEMBLE: + hr_assemble_srv(&call); + break; + case HR_AUTO_ASSEMBLE: + hr_auto_assemble_srv(&call); + break; + case HR_STOP: + hr_stop_srv(&call); + break; + case HR_STOP_ALL: + hr_stop_all_srv(&call); + break; + case HR_FAIL_EXTENT: + hr_fail_extent_srv(&call); + break; + case HR_ADD_HOTSPARE: + hr_add_hotspare_srv(&call); + break; + case HR_GET_VOL_STATES: + hr_get_vol_states_srv(&call); + break; + case HR_GET_VOL_INFO: + hr_get_vol_info_srv(&call); + break; + default: + async_answer_0(&call, EINVAL); + } + } +} + +/** HelenRAID server IPC method crossroad. + * + * Distinguishes between control IPC and block device + * IPC calls. 
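+ *
+ * Calls that target the control service id go to hr_ctl_conn();
+ * any other service id is looked up among the active volumes and
+ * the call is handed to bd_conn() as a block device connection.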
+ */ +static void hr_call_handler(ipc_call_t *icall, void *arg) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol; + + service_id_t svc_id = ipc_get_arg2(icall); + + if (svc_id == ctl_sid) { + hr_ctl_conn(icall); + } else { + vol = hr_get_volume(svc_id); + if (vol == NULL) { + async_answer_0(icall, ENOENT); + return; + } + bd_conn(icall, &vol->hr_bds); + } +} + +int main(int argc, char **argv) +{ + errno_t rc; + + printf("%s: HelenRAID server\n", NAME); + + rc = log_init(NAME); + if (rc != EOK) { + printf("%s: failed to initialize logging\n", NAME); + return 1; + } + + fibril_rwlock_initialize(&hr_volumes_lock); + list_initialize(&hr_volumes); + + async_set_fallback_port_handler(hr_call_handler, NULL); + + rc = loc_server_register(NAME, &hr_srv); + if (rc != EOK) { + HR_ERROR("failed registering server: %s", str_error(rc)); + return EEXIST; + } + + rc = loc_service_register(hr_srv, SERVICE_NAME_HR, fallback_port_id, + &ctl_sid); + if (rc != EOK) { + HR_ERROR("failed registering service: %s", str_error(rc)); + return EEXIST; + } + + printf("%s: Trying automatic assembly.\n", NAME); + size_t assembled = 0; + (void)hr_util_try_assemble(NULL, &assembled); + printf("%s: Assembled %zu volume(s).\n", NAME, assembled); + + printf("%s: Accepting connections.\n", NAME); + task_retval(0); + async_manager(); + + return 0; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/io.c b/uspace/srv/bd/hr/io.c new file mode 100644 index 0000000000..8baa7c5706 --- /dev/null +++ b/uspace/srv/bd/hr/io.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io.h" +#include "parity_stripe.h" +#include "util.h" +#include "var.h" + +/** Wrapper for block_write_direct(), never returns ENOMEM */ +errno_t hr_write_direct(service_id_t service_id, uint64_t ba, size_t cnt, + const void *data) +{ + errno_t rc; + while ((rc = block_write_direct(service_id, ba, cnt, data)) == ENOMEM) + fibril_usleep(MSEC2USEC(250)); /* sleep 250ms */ + + if (rc == EAGAIN) + rc = EIO; + + return rc; +} + +/** Wrapper for block_read_direct(), never returns ENOMEM */ +errno_t hr_read_direct(service_id_t service_id, uint64_t ba, size_t cnt, + void *data) +{ + errno_t rc; + while ((rc = block_read_direct(service_id, ba, cnt, data)) == ENOMEM) + fibril_usleep(MSEC2USEC(250)); /* sleep 250ms */ + + if (rc == EAGAIN) + rc = EIO; + + return rc; +} + +/** Wrapper for block_sync_cache(), never returns ENOMEM */ +errno_t hr_sync_cache(service_id_t service_id, uint64_t ba, size_t cnt) +{ + errno_t rc; + while ((rc = block_sync_cache(service_id, ba, cnt)) == ENOMEM) + fibril_usleep(MSEC2USEC(250)); /* sleep 250ms */ + + if (rc == EAGAIN) + rc = EIO; + + return rc; +} + +errno_t hr_io_worker(void *arg) +{ + hr_io_t *io = arg; + + errno_t rc; + size_t e = io->extent; + hr_extent_t *extents = (hr_extent_t *)&io->vol->extents; + + switch (io->type) { + case HR_BD_READ: + rc = hr_read_direct(extents[e].svc_id, io->ba, io->cnt, + io->data_read); + break; + case HR_BD_WRITE: + rc = hr_write_direct(extents[e].svc_id, io->ba, io->cnt, + io->data_write); + break; + default: + assert(0); + } + + if (io->vol->level == HR_LVL_1) { + atomic_store_explicit(&io->vol->last_ext_pos_arr[e], + io->ba + io->cnt - 1, memory_order_relaxed); + } + + if (rc != EOK) + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + + return rc; +} + +errno_t hr_io_raid5_basic_reader(void *arg) +{ + errno_t rc; + + hr_io_raid5_t *io = arg; + + size_t ext_idx = io->extent; + hr_extent_t *extents = (hr_extent_t *)&io->vol->extents; + + rc = hr_read_direct(extents[ext_idx].svc_id, io->ba, io->cnt, + io->data_read); + if (rc != EOK) + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + + return rc; +} + +errno_t hr_io_raid5_reader(void *arg) +{ + errno_t rc; + + hr_io_raid5_t *io = arg; + hr_stripe_t *stripe = io->stripe; + + size_t ext_idx = io->extent; + hr_extent_t *extents = (hr_extent_t *)&io->vol->extents; + + rc = hr_read_direct(extents[ext_idx].svc_id, io->ba, io->cnt, + io->data_read); + if (rc != EOK) { + hr_stripe_parity_abort(stripe); + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + } + + hr_stripe_commit_parity(stripe, io->strip_off, io->data_read, + io->cnt * io->vol->bsize); + + return rc; +} + +errno_t hr_io_raid5_basic_writer(void *arg) +{ + errno_t rc; + + hr_io_raid5_t *io = arg; + + size_t ext_idx = io->extent; + hr_extent_t *extents = (hr_extent_t *)&io->vol->extents; + + rc = hr_write_direct(extents[ext_idx].svc_id, io->ba, io->cnt, + io->data_write); + if (rc != EOK) + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + + return rc; +} + +errno_t hr_io_raid5_writer(void *arg) +{ + errno_t rc; + + hr_io_raid5_t *io = arg; + hr_stripe_t *stripe = io->stripe; + + size_t ext_idx = io->extent; + hr_extent_t *extents = (hr_extent_t *)&io->vol->extents; + + hr_stripe_commit_parity(stripe, io->strip_off, io->data_write, + io->cnt * io->vol->bsize); + + hr_stripe_wait_for_parity_commits(stripe); + if (stripe->abort) + return EAGAIN; + + 
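+	/*
+	 * Every participating strip has committed its parity
+	 * contribution at this point, so the new data strip can be
+	 * written out; the parity strip itself is written separately
+	 * by hr_io_raid5_parity_writer().
+	 */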
rc = hr_write_direct(extents[ext_idx].svc_id, io->ba, io->cnt, + io->data_write); + if (rc != EOK) + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + + return rc; +} + +errno_t hr_io_raid5_noop_writer(void *arg) +{ + hr_io_raid5_t *io = arg; + hr_stripe_t *stripe = io->stripe; + + hr_stripe_commit_parity(stripe, io->strip_off, io->data_write, + io->cnt * io->vol->bsize); + + return EOK; +} + +errno_t hr_io_raid5_parity_getter(void *arg) +{ + hr_io_raid5_t *io = arg; + hr_stripe_t *stripe = io->stripe; + size_t bsize = stripe->vol->bsize; + + hr_stripe_wait_for_parity_commits(stripe); + if (stripe->abort) + return EAGAIN; + + memcpy(io->data_read, stripe->parity + io->strip_off, io->cnt * bsize); + + return EOK; +} + +errno_t hr_io_raid5_subtract_writer(void *arg) +{ + errno_t rc; + + hr_io_raid5_t *io = arg; + hr_stripe_t *stripe = io->stripe; + + size_t ext_idx = io->extent; + hr_extent_t *extents = (hr_extent_t *)&io->vol->extents; + + uint8_t *data = hr_malloc_waitok(io->cnt * io->vol->bsize); + + rc = hr_read_direct(extents[ext_idx].svc_id, io->ba, io->cnt, data); + if (rc != EOK) { + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + hr_stripe_parity_abort(stripe); + free(data); + return rc; + } + + fibril_mutex_lock(&stripe->parity_lock); + + hr_raid5_xor(stripe->parity + io->strip_off, data, + io->cnt * io->vol->bsize); + + hr_raid5_xor(stripe->parity + io->strip_off, io->data_write, + io->cnt * io->vol->bsize); + + stripe->ps_added++; + fibril_condvar_broadcast(&stripe->ps_added_cv); + fibril_mutex_unlock(&stripe->parity_lock); + + hr_stripe_wait_for_parity_commits(stripe); + if (stripe->abort) + return EAGAIN; + + rc = hr_write_direct(extents[ext_idx].svc_id, io->ba, io->cnt, + io->data_write); + if (rc != EOK) + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + + free(data); + + return rc; +} + +errno_t hr_io_raid5_reconstruct_reader(void *arg) +{ + errno_t rc; + + hr_io_raid5_t *io = arg; + hr_stripe_t *stripe = io->stripe; + + size_t ext_idx = io->extent; + hr_extent_t *extents = (hr_extent_t *)&io->vol->extents; + + uint8_t *data = hr_malloc_waitok(io->cnt * io->vol->bsize); + + rc = hr_read_direct(extents[ext_idx].svc_id, io->ba, io->cnt, data); + if (rc != EOK) { + hr_stripe_parity_abort(stripe); + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + free(data); + return rc; + } + + hr_stripe_commit_parity(stripe, io->strip_off, data, + io->cnt * io->vol->bsize); + + free(data); + + return EOK; +} + +errno_t hr_io_raid5_parity_writer(void *arg) +{ + errno_t rc; + + hr_io_raid5_t *io = arg; + hr_stripe_t *stripe = io->stripe; + + hr_extent_t *extents = (hr_extent_t *)&io->vol->extents; + + hr_stripe_wait_for_parity_commits(stripe); + + if (stripe->abort) + return EAGAIN; + + rc = hr_write_direct(extents[io->extent].svc_id, io->ba, io->cnt, + stripe->parity + io->strip_off); + if (rc != EOK) + io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc); + + return rc; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/io.h b/uspace/srv/bd/hr/io.h new file mode 100644 index 0000000000..3bbfb81c9a --- /dev/null +++ b/uspace/srv/bd/hr/io.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_IO_H +#define _HR_IO_H + +#include "parity_stripe.h" +#include "var.h" +#include "util.h" + +typedef struct hr_io { + hr_bd_op_type_t type; /* read/write */ + uint64_t ba; + uint64_t cnt; + void *data_read; + const void *data_write; + size_t extent; /* extent index */ + hr_volume_t *vol; /* volume back-pointer */ +} hr_io_t; + +typedef struct hr_io_raid5 { + uint64_t ba; + uint64_t cnt; + void *data_read; + const void *data_write; + size_t extent; + uint64_t strip_off; /* needed for offseting parity commits */ + hr_stripe_t *stripe; + hr_volume_t *vol; +} hr_io_raid5_t; + +extern errno_t hr_write_direct(service_id_t, uint64_t, size_t, const void *); +extern errno_t hr_read_direct(service_id_t, uint64_t, size_t, void *); +extern errno_t hr_sync_cache(service_id_t, uint64_t, size_t); + +extern errno_t hr_io_worker(void *); + +extern errno_t hr_io_raid5_basic_reader(void *); +extern errno_t hr_io_raid5_reader(void *); +extern errno_t hr_io_raid5_basic_writer(void *); +extern errno_t hr_io_raid5_writer(void *); +extern errno_t hr_io_raid5_noop_writer(void *); +extern errno_t hr_io_raid5_parity_getter(void *); +extern errno_t hr_io_raid5_subtract_writer(void *); +extern errno_t hr_io_raid5_reconstruct_reader(void *); +extern errno_t hr_io_raid5_parity_writer(void *); + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/meson.build b/uspace/srv/bd/hr/meson.build new file mode 100644 index 0000000000..de7f015ef7 --- /dev/null +++ b/uspace/srv/bd/hr/meson.build @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025 Miroslav Cimerman +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +deps = [ 'block', 'crypto', 'device' ] +src = files( + 'fge.c', + 'hr.c', + 'io.c', + 'metadata/foreign/geom/hr_g_mirror.c', + 'metadata/foreign/geom/hr_g_stripe.c', + 'metadata/foreign/md/hr_md.c', + 'metadata/foreign/softraid/hr_softraid.c', + 'metadata/foreign/softraid/softraid.c', + 'metadata/native.c', + 'metadata/noop.c', + 'parity_stripe.c', + 'raid0.c', + 'raid1.c', + 'raid5.c', + 'superblock.c', + 'util.c' + ) diff --git a/uspace/srv/bd/hr/metadata/foreign/geom/g_mirror.h b/uspace/srv/bd/hr/metadata/foreign/geom/g_mirror.h new file mode 100644 index 0000000000..a49fde9ad1 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/geom/g_mirror.h @@ -0,0 +1,286 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2004-2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_METADATA_FOREIGN_GEOM_MIRROR_H +#define _HR_METADATA_FOREIGN_GEOM_MIRROR_H + +/* needed HelenOS headers */ +#include +#include +#include + +/* new typedefs */ +typedef unsigned char u_char; +typedef unsigned int u_int; +#define bcopy(src, dst, len) memcpy(dst, src, len) + +/* needed FreeBSD header */ +#include "sys_endian.h" + +/* here continues the stripped down original header */ + +#define G_MIRROR_MAGIC "GEOM::MIRROR" + +#define G_MIRROR_BALANCE_NONE 0 +#define G_MIRROR_BALANCE_ROUND_ROBIN 1 +#define G_MIRROR_BALANCE_LOAD 2 +#define G_MIRROR_BALANCE_SPLIT 3 +#define G_MIRROR_BALANCE_PREFER 4 +#define G_MIRROR_BALANCE_MIN G_MIRROR_BALANCE_NONE +#define G_MIRROR_BALANCE_MAX G_MIRROR_BALANCE_PREFER + +#define G_MIRROR_DISK_FLAG_DIRTY 0x0000000000000001ULL +#define G_MIRROR_DISK_FLAG_SYNCHRONIZING 0x0000000000000002ULL +#define G_MIRROR_DISK_FLAG_FORCE_SYNC 0x0000000000000004ULL +#define G_MIRROR_DISK_FLAG_INACTIVE 0x0000000000000008ULL +#define G_MIRROR_DISK_FLAG_HARDCODED 0x0000000000000010ULL +#define G_MIRROR_DISK_FLAG_BROKEN 0x0000000000000020ULL +#define G_MIRROR_DISK_FLAG_CANDELETE 0x0000000000000040ULL + +#define G_MIRROR_DISK_FLAG_MASK (G_MIRROR_DISK_FLAG_DIRTY | \ + G_MIRROR_DISK_FLAG_SYNCHRONIZING | \ + G_MIRROR_DISK_FLAG_FORCE_SYNC | \ + G_MIRROR_DISK_FLAG_INACTIVE | \ + G_MIRROR_DISK_FLAG_CANDELETE) + +#define G_MIRROR_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL +#define G_MIRROR_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL + +#define G_MIRROR_DEVICE_FLAG_DESTROY 0x0100000000000000ULL +#define G_MIRROR_DEVICE_FLAG_DRAIN 0x0200000000000000ULL +#define G_MIRROR_DEVICE_FLAG_CLOSEWAIT 0x0400000000000000ULL +#define G_MIRROR_DEVICE_FLAG_TASTING 0x0800000000000000ULL +#define G_MIRROR_DEVICE_FLAG_WIPE 0x1000000000000000ULL + +struct g_mirror_metadata { + char md_magic[16]; /* Magic value. */ + uint32_t md_version; /* Version number. */ + char md_name[16]; /* Mirror name. */ + uint32_t md_mid; /* Mirror unique ID. */ + uint32_t md_did; /* Disk unique ID. */ + uint8_t md_all; /* Number of disks in mirror. */ + uint32_t md_genid; /* Generation ID. */ + uint32_t md_syncid; /* Synchronization ID. */ + uint8_t md_priority; /* Disk priority. */ + uint32_t md_slice; /* Slice size. */ + uint8_t md_balance; /* Balance type. */ + uint64_t md_mediasize; /* Size of the smallest disk in mirror. */ + uint32_t md_sectorsize; /* Sector size. */ + uint64_t md_sync_offset; /* Synchronized offset. */ + uint64_t md_mflags; /* Additional mirror flags. */ + uint64_t md_dflags; /* Additional disk flags. */ + char md_provider[16]; /* Hardcoded provider. */ + uint64_t md_provsize; /* Provider's size. */ + u_char md_hash[16]; /* MD5 hash. 
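+				      (computed over the first 119
+				      bytes of the encoded block)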
*/ +}; + +static __inline void +mirror_metadata_encode(struct g_mirror_metadata *md, u_char *data) +{ + uint8_t md5_hash[16]; + + bcopy(md->md_magic, data, 16); + le32enc(data + 16, md->md_version); + bcopy(md->md_name, data + 20, 16); + le32enc(data + 36, md->md_mid); + le32enc(data + 40, md->md_did); + *(data + 44) = md->md_all; + le32enc(data + 45, md->md_genid); + le32enc(data + 49, md->md_syncid); + *(data + 53) = md->md_priority; + le32enc(data + 54, md->md_slice); + *(data + 58) = md->md_balance; + le64enc(data + 59, md->md_mediasize); + le32enc(data + 67, md->md_sectorsize); + le64enc(data + 71, md->md_sync_offset); + le64enc(data + 79, md->md_mflags); + le64enc(data + 87, md->md_dflags); + bcopy(md->md_provider, data + 95, 16); + le64enc(data + 111, md->md_provsize); + + errno_t rc = create_hash(data, 119, md5_hash, HASH_MD5); + assert(rc == EOK); + bcopy(md5_hash, data + 119, 16); +} + +static __inline int +mirror_metadata_decode_v3v4(const u_char *data, struct g_mirror_metadata *md) +{ + uint8_t md5_hash[16]; + + bcopy(data + 20, md->md_name, 16); + md->md_mid = le32dec(data + 36); + md->md_did = le32dec(data + 40); + md->md_all = *(data + 44); + md->md_genid = le32dec(data + 45); + md->md_syncid = le32dec(data + 49); + md->md_priority = *(data + 53); + md->md_slice = le32dec(data + 54); + md->md_balance = *(data + 58); + md->md_mediasize = le64dec(data + 59); + md->md_sectorsize = le32dec(data + 67); + md->md_sync_offset = le64dec(data + 71); + md->md_mflags = le64dec(data + 79); + md->md_dflags = le64dec(data + 87); + bcopy(data + 95, md->md_provider, 16); + md->md_provsize = le64dec(data + 111); + bcopy(data + 119, md->md_hash, 16); + + errno_t rc = create_hash(data, 119, md5_hash, HASH_MD5); + assert(rc == EOK); + if (memcmp(md->md_hash, md5_hash, 16) != 0) + return (EINVAL); + + return (0); +} +static __inline int +mirror_metadata_decode(const u_char *data, struct g_mirror_metadata *md) +{ + int error; + + bcopy(data, md->md_magic, 16); + if (str_lcmp(md->md_magic, G_MIRROR_MAGIC, 16) != 0) + return (EINVAL); + + md->md_version = le32dec(data + 16); + switch (md->md_version) { + case 4: + error = mirror_metadata_decode_v3v4(data, md); + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static __inline const char * +balance_name(u_int balance) +{ + static const char *algorithms[] = { + [G_MIRROR_BALANCE_NONE] = "none", + [G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin", + [G_MIRROR_BALANCE_LOAD] = "load", + [G_MIRROR_BALANCE_SPLIT] = "split", + [G_MIRROR_BALANCE_PREFER] = "prefer", + [G_MIRROR_BALANCE_MAX + 1] = "unknown" + }; + + if (balance > G_MIRROR_BALANCE_MAX) + balance = G_MIRROR_BALANCE_MAX + 1; + + return (algorithms[balance]); +} + +static __inline int +balance_id(const char *name) +{ + static const char *algorithms[] = { + [G_MIRROR_BALANCE_NONE] = "none", + [G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin", + [G_MIRROR_BALANCE_LOAD] = "load", + [G_MIRROR_BALANCE_SPLIT] = "split", + [G_MIRROR_BALANCE_PREFER] = "prefer" + }; + int n; + + for (n = G_MIRROR_BALANCE_MIN; n <= G_MIRROR_BALANCE_MAX; n++) { + if (str_cmp(name, algorithms[n]) == 0) + return (n); + } + return (-1); +} + +static __inline void +mirror_metadata_dump(const struct g_mirror_metadata *md) +{ + static const char hex[] = "0123456789abcdef"; + char hash[16 * 2 + 1]; + u_int i; + + printf(" magic: %s\n", md->md_magic); + printf(" version: %u\n", (u_int)md->md_version); + printf(" name: %s\n", md->md_name); + printf(" mid: %u\n", (u_int)md->md_mid); + printf(" did: %u\n", 
(u_int)md->md_did); + printf(" all: %u\n", (u_int)md->md_all); + printf(" genid: %u\n", (u_int)md->md_genid); + printf(" syncid: %u\n", (u_int)md->md_syncid); + printf(" priority: %u\n", (u_int)md->md_priority); + printf(" slice: %u\n", (u_int)md->md_slice); + printf(" balance: %s\n", balance_name((u_int)md->md_balance)); + printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize); + printf("sectorsize: %u\n", (u_int)md->md_sectorsize); + printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset); + printf(" mflags:"); + if (md->md_mflags == 0) + printf(" NONE"); + else { + if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) + printf(" NOFAILSYNC"); + if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0) + printf(" NOAUTOSYNC"); + } + printf("\n"); + printf(" dflags:"); + if (md->md_dflags == 0) + printf(" NONE"); + else { + if ((md->md_dflags & G_MIRROR_DISK_FLAG_DIRTY) != 0) + printf(" DIRTY"); + if ((md->md_dflags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) + printf(" SYNCHRONIZING"); + if ((md->md_dflags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) + printf(" FORCE_SYNC"); + if ((md->md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) + printf(" INACTIVE"); + } + printf("\n"); + printf("hcprovider: %s\n", md->md_provider); + printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); + /* bzero(hash, sizeof(hash)); */ + memset(hash, 0, sizeof(hash)); + + for (i = 0; i < 16; i++) { + hash[i * 2] = hex[md->md_hash[i] >> 4]; + hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; + } + printf(" MD5 hash: %s\n", hash); +} + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/foreign/geom/g_stripe.h b/uspace/srv/bd/hr/metadata/foreign/geom/g_stripe.h new file mode 100644 index 0000000000..8495142d6b --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/geom/g_stripe.h @@ -0,0 +1,102 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2004-2005 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_METADATA_FOREIGN_GEOM_STRIPE_H +#define _HR_METADATA_FOREIGN_GEOM_STRIPE_H + +/* needed HelenOS headers */ +#include +#include +#include + +/* new typedefs */ +typedef unsigned char u_char; +typedef unsigned int u_int; +#define bcopy(src, dst, len) memcpy(dst, src, len) + +/* needed FreeBSD header */ +#include "sys_endian.h" + +/* here continues the stripped down original header */ + +#define G_STRIPE_MAGIC "GEOM::STRIPE" + +#define G_STRIPE_VERSION 3 + +struct g_stripe_metadata { + char md_magic[16]; /* Magic value. */ + uint32_t md_version; /* Version number. */ + char md_name[16]; /* Stripe name. */ + uint32_t md_id; /* Unique ID. */ + uint16_t md_no; /* Disk number. */ + uint16_t md_all; /* Number of all disks. */ + uint32_t md_stripesize; /* Stripe size. */ + char md_provider[16]; /* Hardcoded provider. */ + uint64_t md_provsize; /* Provider's size. */ +}; + +static __inline void +stripe_metadata_encode(const struct g_stripe_metadata *md, u_char *data) +{ + + bcopy(md->md_magic, data, sizeof(md->md_magic)); + le32enc(data + 16, md->md_version); + bcopy(md->md_name, data + 20, sizeof(md->md_name)); + le32enc(data + 36, md->md_id); + le16enc(data + 40, md->md_no); + le16enc(data + 42, md->md_all); + le32enc(data + 44, md->md_stripesize); + bcopy(md->md_provider, data + 48, sizeof(md->md_provider)); + le64enc(data + 64, md->md_provsize); +} +static __inline void +stripe_metadata_decode(const u_char *data, struct g_stripe_metadata *md) +{ + + bcopy(data, md->md_magic, sizeof(md->md_magic)); + md->md_version = le32dec(data + 16); + bcopy(data + 20, md->md_name, sizeof(md->md_name)); + md->md_id = le32dec(data + 36); + md->md_no = le16dec(data + 40); + md->md_all = le16dec(data + 42); + md->md_stripesize = le32dec(data + 44); + bcopy(data + 48, md->md_provider, sizeof(md->md_provider)); + md->md_provsize = le64dec(data + 64); +} + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/foreign/geom/hr_g_mirror.c b/uspace/srv/bd/hr/metadata/foreign/geom/hr_g_mirror.c new file mode 100644 index 0000000000..176e117402 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/geom/hr_g_mirror.c @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../io.h" +#include "../../../util.h" +#include "../../../var.h" + +#include "g_mirror.h" + +/* not exposed */ +static void *meta_gmirror_alloc_struct(void); +static void meta_gmirror_encode(void *, void *); +static errno_t meta_gmirror_decode(const void *, void *); +static errno_t meta_gmirror_get_block(service_id_t, void **); +static errno_t meta_gmirror_write_block(service_id_t, const void *); + +static errno_t meta_gmirror_probe(service_id_t, void **); +static errno_t meta_gmirror_init_vol2meta(hr_volume_t *); +static errno_t meta_gmirror_init_meta2vol(const list_t *, hr_volume_t *); +static errno_t meta_gmirror_erase_block(service_id_t); +static bool meta_gmirror_compare_uuids(const void *, const void *); +static void meta_gmirror_inc_counter(hr_volume_t *); +static errno_t meta_gmirror_save(hr_volume_t *, bool); +static errno_t meta_gmirror_save_ext(hr_volume_t *, size_t, bool); +static const char *meta_gmirror_get_devname(const void *); +static hr_level_t meta_gmirror_get_level(const void *); +static uint64_t meta_gmirror_get_data_offset(void); +static size_t meta_gmirror_get_size(void); +static uint8_t meta_gmirror_get_flags(void); +static hr_metadata_type_t meta_gmirror_get_type(void); +static void meta_gmirror_dump(const void *); + +hr_superblock_ops_t metadata_gmirror_ops = { + .probe = meta_gmirror_probe, + .init_vol2meta = meta_gmirror_init_vol2meta, + .init_meta2vol = meta_gmirror_init_meta2vol, + .erase_block = meta_gmirror_erase_block, + .compare_uuids = meta_gmirror_compare_uuids, + .inc_counter = meta_gmirror_inc_counter, + .save = meta_gmirror_save, + .save_ext = meta_gmirror_save_ext, + .get_devname = meta_gmirror_get_devname, + .get_level = meta_gmirror_get_level, + .get_data_offset = meta_gmirror_get_data_offset, + .get_size = meta_gmirror_get_size, + .get_flags = meta_gmirror_get_flags, + .get_type = meta_gmirror_get_type, + .dump = meta_gmirror_dump +}; + +static errno_t meta_gmirror_probe(service_id_t svc_id, void **rmd) +{ + errno_t rc; + void *meta_block; + + void *metadata_struct = meta_gmirror_alloc_struct(); + if (metadata_struct == NULL) + return ENOMEM; + + rc = meta_gmirror_get_block(svc_id, &meta_block); + if (rc != EOK) + goto error; + + rc = meta_gmirror_decode(meta_block, metadata_struct); + + free(meta_block); + + if (rc != EOK) + goto error; + + *rmd = metadata_struct; + return EOK; + +error: + free(metadata_struct); + return rc; +} + +static errno_t meta_gmirror_init_vol2meta(hr_volume_t *vol) +{ + (void)vol; + + return ENOTSUP; +} + +static errno_t meta_gmirror_init_meta2vol(const list_t *list, hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + + struct g_mirror_metadata *main_meta = NULL; + uint64_t max_counter_val = 0; + + list_foreach(*list, link, struct dev_list_member, iter) { + struct g_mirror_metadata 
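+		    /* the replica with the highest md_syncid is authoritative */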
*iter_meta = iter->md; + + if (iter_meta->md_syncid >= max_counter_val) { + max_counter_val = iter_meta->md_syncid; + main_meta = iter_meta; + } + } + + assert(main_meta != NULL); + + vol->truncated_blkno = + main_meta->md_mediasize / main_meta->md_sectorsize; + + vol->data_blkno = vol->truncated_blkno - 1; + + vol->data_offset = 0; + + if (main_meta->md_all > HR_MAX_EXTENTS) { + HR_DEBUG("Assembled volume has %u extents (max = %u)", + (unsigned)main_meta->md_all, HR_MAX_EXTENTS); + rc = EINVAL; + goto error; + } + + vol->extent_no = main_meta->md_all; + + vol->layout = HR_LAYOUT_NONE; + + vol->strip_size = 0; + + vol->bsize = main_meta->md_sectorsize; + + vol->in_mem_md = + calloc(vol->extent_no, sizeof(struct g_mirror_metadata)); + if (vol->in_mem_md == NULL) + return ENOMEM; + memcpy(vol->in_mem_md, main_meta, sizeof(struct g_mirror_metadata)); + + bool rebuild_set = false; + + uint8_t index = 0; + list_foreach(*list, link, struct dev_list_member, iter) { + struct g_mirror_metadata *iter_meta = iter->md; + + struct g_mirror_metadata *p = + ((struct g_mirror_metadata *)vol->in_mem_md) + index; + memcpy(p, iter_meta, sizeof(*p)); + + vol->extents[index].svc_id = iter->svc_id; + + bool invalidate = false; + bool rebuild_this_ext = false; + + if (iter_meta->md_dflags & G_MIRROR_DISK_FLAG_DIRTY) + invalidate = true; + if (iter_meta->md_syncid != max_counter_val) + invalidate = true; + + if (iter_meta->md_dflags & G_MIRROR_DISK_FLAG_SYNCHRONIZING && + !invalidate) { + if (rebuild_set) { + HR_DEBUG("only 1 rebuilt extent allowed"); + rc = EINVAL; + goto error; + } + rebuild_set = true; + rebuild_this_ext = true; + vol->rebuild_blk = iter_meta->md_sync_offset; + } + + if (!rebuild_this_ext && !invalidate) + vol->extents[index].state = HR_EXT_ONLINE; + else if (rebuild_this_ext && !invalidate) + vol->extents[index].state = HR_EXT_REBUILD; + else + vol->extents[index].state = HR_EXT_INVALID; + + index++; + } + + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_NONE) + vol->extents[i].state = HR_EXT_MISSING; + } + +error: + return rc; +} + +static errno_t meta_gmirror_erase_block(service_id_t dev) +{ + HR_DEBUG("%s()", __func__); + + (void)dev; + + return ENOTSUP; +} + +static bool meta_gmirror_compare_uuids(const void *m1_v, const void *m2_v) +{ + const struct g_mirror_metadata *m1 = m1_v; + const struct g_mirror_metadata *m2 = m2_v; + if (m1->md_mid == m2->md_mid) + return true; + + return false; +} + +static void meta_gmirror_inc_counter(hr_volume_t *vol) +{ + fibril_mutex_lock(&vol->md_lock); + + for (size_t d = 0; d < vol->extent_no; d++) { + struct g_mirror_metadata *md = + ((struct g_mirror_metadata *)vol->in_mem_md) + d; + md->md_syncid++; + } + + fibril_mutex_unlock(&vol->md_lock); +} + +static errno_t meta_gmirror_save(hr_volume_t *vol, bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + fibril_rwlock_read_lock(&vol->extents_lock); + + for (size_t i = 0; i < vol->extent_no; i++) + meta_gmirror_save_ext(vol, i, with_state_callback); + + fibril_rwlock_read_unlock(&vol->extents_lock); + + return EOK; +} + +static errno_t meta_gmirror_save_ext(hr_volume_t *vol, size_t ext_idx, + bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + + void *md_block = hr_calloc_waitok(1, vol->bsize); + + struct g_mirror_metadata *md = + ((struct g_mirror_metadata *)vol->in_mem_md) + ext_idx; + + hr_extent_t *ext = &vol->extents[ext_idx]; + + fibril_rwlock_read_lock(&vol->states_lock); + hr_ext_state_t 
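+	    /* snapshot the extent state under the states read-lock */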
s = ext->state; + fibril_rwlock_read_unlock(&vol->states_lock); + + if (s != HR_EXT_ONLINE && s != HR_EXT_REBUILD) { + free(md_block); + return EINVAL; + } + + fibril_mutex_lock(&vol->md_lock); + + if (s == HR_EXT_REBUILD) { + md->md_sync_offset = vol->rebuild_blk; + md->md_dflags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; + } else { + md->md_sync_offset = 0; + md->md_dflags &= ~(G_MIRROR_DISK_FLAG_SYNCHRONIZING); + } + + meta_gmirror_encode(md, md_block); + errno_t rc = meta_gmirror_write_block(ext->svc_id, md_block); + if (rc != EOK && with_state_callback) + vol->hr_ops.ext_state_cb(vol, ext_idx, rc); + + fibril_mutex_unlock(&vol->md_lock); + + if (with_state_callback) + vol->hr_ops.vol_state_eval(vol); + + free(md_block); + return EOK; +} + +static const char *meta_gmirror_get_devname(const void *md_v) +{ + const struct g_mirror_metadata *md = md_v; + + return md->md_name; +} + +static hr_level_t meta_gmirror_get_level(const void *md_v) +{ + (void)md_v; + + return HR_LVL_1; +} + +static uint64_t meta_gmirror_get_data_offset(void) +{ + return 0; +} + +static size_t meta_gmirror_get_size(void) +{ + return 1; +} + +static uint8_t meta_gmirror_get_flags(void) +{ + uint8_t flags = 0; + + return flags; +} + +static hr_metadata_type_t meta_gmirror_get_type(void) +{ + return HR_METADATA_GEOM_MIRROR; +} + +static void meta_gmirror_dump(const void *md_v) +{ + HR_DEBUG("%s()", __func__); + + mirror_metadata_dump(md_v); +} + +static void *meta_gmirror_alloc_struct(void) +{ + return calloc(1, sizeof(struct g_mirror_metadata)); +} + +static void meta_gmirror_encode(void *md_v, void *block) +{ + HR_DEBUG("%s()", __func__); + + mirror_metadata_encode(md_v, block); +} + +static errno_t meta_gmirror_decode(const void *block, void *md_v) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = mirror_metadata_decode(block, md_v); + return rc; +} + +static errno_t meta_gmirror_get_block(service_id_t dev, void **rblock) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + void *block; + + if (rblock == NULL) + return EINVAL; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize < sizeof(struct g_mirror_metadata)) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < 1) + return EINVAL; + + block = malloc(bsize); + if (block == NULL) + return ENOMEM; + + rc = hr_read_direct(dev, blkno - 1, 1, block); + if (rc != EOK) { + free(block); + return rc; + } + + *rblock = block; + return EOK; +} + +static errno_t meta_gmirror_write_block(service_id_t dev, const void *block) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize < sizeof(struct g_mirror_metadata)) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < 1) + return EINVAL; + + rc = hr_write_direct(dev, blkno - 1, 1, block); + + return rc; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/foreign/geom/hr_g_stripe.c b/uspace/srv/bd/hr/metadata/foreign/geom/hr_g_stripe.c new file mode 100644 index 0000000000..f173fc0e16 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/geom/hr_g_stripe.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../io.h" +#include "../../../util.h" +#include "../../../var.h" + +#include "g_stripe.h" + +/* not exposed */ +static void *meta_gstripe_alloc_struct(void); +/* static void meta_gstripe_encode(void *, void *); */ +static errno_t meta_gstripe_decode(const void *, void *); +static errno_t meta_gstripe_get_block(service_id_t, void **); +/* static errno_t meta_gstripe_write_block(service_id_t, const void *); */ + +static errno_t meta_gstripe_probe(service_id_t, void **); +static errno_t meta_gstripe_init_vol2meta(hr_volume_t *); +static errno_t meta_gstripe_init_meta2vol(const list_t *, hr_volume_t *); +static errno_t meta_gstripe_erase_block(service_id_t); +static bool meta_gstripe_compare_uuids(const void *, const void *); +static void meta_gstripe_inc_counter(hr_volume_t *); +static errno_t meta_gstripe_save(hr_volume_t *, bool); +static errno_t meta_gstripe_save_ext(hr_volume_t *, size_t, bool); +static const char *meta_gstripe_get_devname(const void *); +static hr_level_t meta_gstripe_get_level(const void *); +static uint64_t meta_gstripe_get_data_offset(void); +static size_t meta_gstripe_get_size(void); +static uint8_t meta_gstripe_get_flags(void); +static hr_metadata_type_t meta_gstripe_get_type(void); +static void meta_gstripe_dump(const void *); + +hr_superblock_ops_t metadata_gstripe_ops = { + .probe = meta_gstripe_probe, + .init_vol2meta = meta_gstripe_init_vol2meta, + .init_meta2vol = meta_gstripe_init_meta2vol, + .erase_block = meta_gstripe_erase_block, + .compare_uuids = meta_gstripe_compare_uuids, + .inc_counter = meta_gstripe_inc_counter, + .save = meta_gstripe_save, + .save_ext = meta_gstripe_save_ext, + .get_devname = meta_gstripe_get_devname, + .get_level = meta_gstripe_get_level, + .get_data_offset = meta_gstripe_get_data_offset, + .get_size = meta_gstripe_get_size, + .get_flags = meta_gstripe_get_flags, + .get_type = meta_gstripe_get_type, + .dump = meta_gstripe_dump +}; + +static 
errno_t meta_gstripe_probe(service_id_t svc_id, void **rmd) +{ + errno_t rc; + void *meta_block; + + void *metadata_struct = meta_gstripe_alloc_struct(); + if (metadata_struct == NULL) + return ENOMEM; + + rc = meta_gstripe_get_block(svc_id, &meta_block); + if (rc != EOK) + goto error; + + rc = meta_gstripe_decode(meta_block, metadata_struct); + + free(meta_block); + + if (rc != EOK) + goto error; + + *rmd = metadata_struct; + return EOK; + +error: + free(metadata_struct); + return rc; +} + +static errno_t meta_gstripe_init_vol2meta(hr_volume_t *vol) +{ + (void)vol; + return ENOTSUP; +} + +static errno_t meta_gstripe_init_meta2vol(const list_t *list, hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + + /* get bsize */ + size_t bsize; + struct dev_list_member *memb = list_get_instance(list_first(list), + struct dev_list_member, link); + rc = block_get_bsize(memb->svc_id, &bsize); + if (rc != EOK) + goto error; + + vol->bsize = bsize; + + uint64_t smallest_provider_size = ~0ULL; + struct g_stripe_metadata *main_meta = NULL; + + list_foreach(*list, link, struct dev_list_member, iter) { + struct g_stripe_metadata *iter_meta = iter->md; + + if (iter_meta->md_provsize < smallest_provider_size) { + smallest_provider_size = iter_meta->md_provsize; + main_meta = iter_meta; + } + } + + assert(main_meta != NULL); + + vol->truncated_blkno = + main_meta->md_provsize / bsize; + + vol->extent_no = main_meta->md_all; + + vol->data_blkno = (vol->truncated_blkno - 1) * vol->extent_no; + + vol->data_offset = 0; + + if (vol->extent_no > HR_MAX_EXTENTS) { + HR_DEBUG("Assembled volume has %u extents (max = %u)", + (unsigned)main_meta->md_all, HR_MAX_EXTENTS); + rc = EINVAL; + goto error; + } + + vol->strip_size = main_meta->md_stripesize; + + vol->layout = HR_LAYOUT_NONE; + + vol->in_mem_md = calloc(1, sizeof(struct g_stripe_metadata)); + if (vol->in_mem_md == NULL) + return ENOMEM; + memcpy(vol->in_mem_md, main_meta, sizeof(struct g_stripe_metadata)); + + list_foreach(*list, link, struct dev_list_member, iter) { + struct g_stripe_metadata *iter_meta = iter->md; + uint16_t index = iter_meta->md_no; + + /* reject out-of-range disk numbers in corrupt metadata */ + if (index >= vol->extent_no) { + rc = EINVAL; + goto error; + } + + vol->extents[index].svc_id = iter->svc_id; + + vol->extents[index].state = HR_EXT_ONLINE; + } + + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_NONE) + vol->extents[i].state = HR_EXT_MISSING; + } + +error: + return rc; +} + +static errno_t meta_gstripe_erase_block(service_id_t dev) +{ + HR_DEBUG("%s()", __func__); + + (void)dev; + + return ENOTSUP; +} + +static bool meta_gstripe_compare_uuids(const void *md1_v, const void *md2_v) +{ + const struct g_stripe_metadata *md1 = md1_v; + const struct g_stripe_metadata *md2 = md2_v; + if (md1->md_id == md2->md_id) + return true; + + return false; +} + +static void meta_gstripe_inc_counter(hr_volume_t *vol) +{ + (void)vol; +} + +static errno_t meta_gstripe_save(hr_volume_t *vol, bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + (void)vol; + (void)with_state_callback; + + return ENOTSUP; +} + +static errno_t meta_gstripe_save_ext(hr_volume_t *vol, size_t ext_idx, + bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + (void)vol; + (void)ext_idx; + (void)with_state_callback; + + return ENOTSUP; +} + +static const char *meta_gstripe_get_devname(const void *md_v) +{ + const struct g_stripe_metadata *md = md_v; + + return md->md_name; +} + +static hr_level_t meta_gstripe_get_level(const void *md_v) +{ + (void)md_v; + + return HR_LVL_0; +} + +static uint64_t meta_gstripe_get_data_offset(void) +{ + return 0; +} + +static size_t meta_gstripe_get_size(void) +{ + return 1; +} + +static uint8_t 
meta_gstripe_get_flags(void) +{ + uint8_t flags = 0; + + return flags; +} + +static hr_metadata_type_t meta_gstripe_get_type(void) +{ + return HR_METADATA_GEOM_STRIPE; +} + +static void meta_gstripe_dump(const void *md_v) +{ + HR_DEBUG("%s()", __func__); + + const struct g_stripe_metadata *md = md_v; + + printf(" magic: %s\n", md->md_magic); + printf(" version: %u\n", (u_int)md->md_version); + printf(" name: %s\n", md->md_name); + printf(" id: %u\n", (u_int)md->md_id); + printf(" no: %u\n", (u_int)md->md_no); + printf(" all: %u\n", (u_int)md->md_all); + printf("stripesize: %u\n", (u_int)md->md_stripesize); + printf(" mediasize: %jd\n", (intmax_t)md->md_provsize); +} + +static void *meta_gstripe_alloc_struct(void) +{ + return calloc(1, sizeof(struct g_stripe_metadata)); +} + +#if 0 +static void meta_gstripe_encode(void *md_v, void *block) +{ + HR_DEBUG("%s()", __func__); + + stripe_metadata_encode(md_v, block); +} +#endif + +static errno_t meta_gstripe_decode(const void *block, void *md_v) +{ + HR_DEBUG("%s()", __func__); + + struct g_stripe_metadata *md = md_v; + + stripe_metadata_decode(block, md); + + if (str_lcmp(md->md_magic, G_STRIPE_MAGIC, 16) != 0) + return EINVAL; + + return EOK; +} + +static errno_t meta_gstripe_get_block(service_id_t dev, void **rblock) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + void *block; + + if (rblock == NULL) + return EINVAL; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize < sizeof(struct g_stripe_metadata)) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < 1) + return EINVAL; + + block = malloc(bsize); + if (block == NULL) + return ENOMEM; + + rc = hr_read_direct(dev, blkno - 1, 1, block); + if (rc != EOK) { + free(block); + return rc; + } + + *rblock = block; + return EOK; +} + +#if 0 +static errno_t meta_gstripe_write_block(service_id_t dev, const void *block) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize < sizeof(struct g_stripe_metadata)) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < 1) + return EINVAL; + + rc = hr_write_direct(dev, blkno - 1, 1, block); + + return rc; +} +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/foreign/geom/sys_endian.h b/uspace/srv/bd/hr/metadata/foreign/geom/sys_endian.h new file mode 100644 index 0000000000..e2e756c0ed --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/geom/sys_endian.h @@ -0,0 +1,139 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2002 Thomas Moestl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _HR_METADATA_FOREIGN_GEOM_SYS_ENDIAN_H +#define _HR_METADATA_FOREIGN_GEOM_SYS_ENDIAN_H + +/* Alignment-agnostic encode/decode bytestream to/from little/big endian. */ +static __inline uint16_t +be16dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return ((p[0] << 8) | p[1]); +} + +static __inline uint32_t +be32dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((unsigned)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]); +} + +static __inline uint64_t +be64dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((uint64_t)be32dec(p) << 32) | be32dec(p + 4)); +} + +static __inline uint16_t +le16dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return ((p[1] << 8) | p[0]); +} + +static __inline uint32_t +le32dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((unsigned)p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]); +} + +static __inline uint64_t +le64dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((uint64_t)le32dec(p + 4) << 32) | le32dec(p)); +} + +static __inline void +be16enc(void *pp, uint16_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = (u >> 8) & 0xff; + p[1] = u & 0xff; +} + +static __inline void +be32enc(void *pp, uint32_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = (u >> 24) & 0xff; + p[1] = (u >> 16) & 0xff; + p[2] = (u >> 8) & 0xff; + p[3] = u & 0xff; +} + +static __inline void +be64enc(void *pp, uint64_t u) +{ + uint8_t *p = (uint8_t *)pp; + + be32enc(p, (uint32_t)(u >> 32)); + be32enc(p + 4, (uint32_t)(u & 0xffffffffU)); +} + +static __inline void +le16enc(void *pp, uint16_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; +} + +static __inline void +le32enc(void *pp, uint32_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; + p[2] = (u >> 16) & 0xff; + p[3] = (u >> 24) & 0xff; +} + +static __inline void +le64enc(void *pp, uint64_t u) +{ + uint8_t *p = (uint8_t *)pp; + + le32enc(p, (uint32_t)(u & 0xffffffffU)); + le32enc(p + 4, (uint32_t)(u >> 32)); +} + +#endif diff --git a/uspace/srv/bd/hr/metadata/foreign/md/hr_md.c b/uspace/srv/bd/hr/metadata/foreign/md/hr_md.c new file mode 100644 index 0000000000..45533c26b9 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/md/hr_md.c @@ -0,0 +1,674 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../io.h" +#include "../../../util.h" +#include "../../../var.h" + +#include "md_p.h" + +/* not exposed */ +static void *meta_md_alloc_struct(void); +static void meta_md_encode(void *, void *); +static errno_t meta_md_decode(const void *, void *); +static errno_t meta_md_get_block(service_id_t, void **); +static errno_t meta_md_write_block(service_id_t, const void *); + +static errno_t meta_md_probe(service_id_t, void **); +static errno_t meta_md_init_vol2meta(hr_volume_t *); +static errno_t meta_md_init_meta2vol(const list_t *, hr_volume_t *); +static errno_t meta_md_erase_block(service_id_t); +static bool meta_md_compare_uuids(const void *, const void *); +static void meta_md_inc_counter(hr_volume_t *); +static errno_t meta_md_save(hr_volume_t *, bool); +static errno_t meta_md_save_ext(hr_volume_t *, size_t, bool); +static const char *meta_md_get_devname(const void *); +static hr_level_t meta_md_get_level(const void *); +static uint64_t meta_md_get_data_offset(void); +static size_t meta_md_get_size(void); +static uint8_t meta_md_get_flags(void); +static hr_metadata_type_t meta_md_get_type(void); +static void meta_md_dump(const void *); + +hr_superblock_ops_t metadata_md_ops = { + .probe = meta_md_probe, + .init_vol2meta = meta_md_init_vol2meta, + .init_meta2vol = meta_md_init_meta2vol, + .erase_block = meta_md_erase_block, + .compare_uuids = meta_md_compare_uuids, + .inc_counter = meta_md_inc_counter, + .save = meta_md_save, + .save_ext = meta_md_save_ext, + .get_devname = meta_md_get_devname, + .get_level = meta_md_get_level, + .get_data_offset = meta_md_get_data_offset, + .get_size = meta_md_get_size, + .get_flags = meta_md_get_flags, + .get_type = meta_md_get_type, + .dump = meta_md_dump +}; + +static errno_t meta_md_probe(service_id_t svc_id, void **rmd) +{ + errno_t rc; + void *meta_block; + + void *metadata_struct = meta_md_alloc_struct(); + if (metadata_struct == NULL) + return ENOMEM; + + rc = meta_md_get_block(svc_id, &meta_block); + if (rc != EOK) + goto error; + + rc = meta_md_decode(meta_block, metadata_struct); + + free(meta_block); + + if (rc != EOK) + goto error; + + *rmd = metadata_struct; + return EOK; + +error: + free(metadata_struct); + return rc; +} + +static errno_t meta_md_init_vol2meta(hr_volume_t *vol) +{ + (void)vol; + + return ENOTSUP; +} + +static errno_t meta_md_init_meta2vol(const list_t *list, hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + + struct 
mdp_superblock_1 *main_meta = NULL; + uint64_t max_events = 0; + + list_foreach(*list, link, struct dev_list_member, iter) { + struct mdp_superblock_1 *iter_meta = iter->md; + + if (iter_meta->events >= max_events) { + max_events = iter_meta->events; + main_meta = iter_meta; + } + } + + assert(main_meta != NULL); + + vol->bsize = 512; + + vol->truncated_blkno = main_meta->size; + + vol->extent_no = main_meta->raid_disks; + + switch (vol->level) { + case HR_LVL_0: + vol->data_blkno = vol->truncated_blkno * vol->extent_no; + vol->layout = HR_LAYOUT_NONE; + break; + case HR_LVL_1: + vol->data_blkno = vol->truncated_blkno; + vol->layout = HR_LAYOUT_NONE; + break; + case HR_LVL_4: + vol->data_blkno = vol->truncated_blkno * (vol->extent_no - 1); + vol->layout = HR_LAYOUT_RAID4_N; + break; + case HR_LVL_5: + vol->data_blkno = vol->truncated_blkno * (vol->extent_no - 1); + switch (main_meta->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + vol->layout = HR_LAYOUT_RAID5_NR; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + vol->layout = HR_LAYOUT_RAID5_0R; + break; + case ALGORITHM_LEFT_SYMMETRIC: + vol->layout = HR_LAYOUT_RAID5_NC; + break; + } + break; + default: + return EINVAL; + } + + vol->data_offset = main_meta->data_offset; + + vol->strip_size = main_meta->chunksize * 512; + + vol->in_mem_md = calloc(vol->extent_no, MD_SIZE * 512); + if (vol->in_mem_md == NULL) + return ENOMEM; + + bool rebuild_set = false; + size_t i = 0; + list_foreach(*list, link, struct dev_list_member, iter) { + struct mdp_superblock_1 *iter_meta = iter->md; + + uint16_t index = iter_meta->dev_roles[iter_meta->dev_number]; + + /* spare (0xffff) and faulty (0xfffe) roles map to no extent */ + if (index >= vol->extent_no) + continue; + + struct mdp_superblock_1 *p = (struct mdp_superblock_1 *) + (((char *)vol->in_mem_md) + MD_SIZE * 512 * index); + memcpy(p, iter_meta, MD_SIZE * 512); + + vol->extents[index].svc_id = iter->svc_id; + + bool invalidate = false; + bool rebuild_this_ext = false; + + if (iter_meta->events != max_events) + invalidate = true; + + if (iter_meta->feature_map & MD_DISK_SYNC && !invalidate) { + if (rebuild_set) { + HR_DEBUG("only 1 rebuilt extent allowed"); + rc = EINVAL; + goto error; + } + rebuild_set = true; + rebuild_this_ext = true; + vol->rebuild_blk = iter_meta->resync_offset; + } + + if (!rebuild_this_ext && !invalidate) + vol->extents[index].state = HR_EXT_ONLINE; + else if (rebuild_this_ext && !invalidate) + vol->extents[index].state = HR_EXT_REBUILD; + else + vol->extents[index].state = HR_EXT_INVALID; + + i++; + if (i == vol->extent_no) + break; + } + + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_NONE) + vol->extents[i].state = HR_EXT_MISSING; + } + +error: + return rc; +} + +static errno_t meta_md_erase_block(service_id_t dev) +{ + HR_DEBUG("%s()", __func__); + + (void)dev; + + return ENOTSUP; +} + +static bool meta_md_compare_uuids(const void *m1_v, const void *m2_v) +{ + const struct mdp_superblock_1 *m1 = m1_v; + const struct mdp_superblock_1 *m2 = m2_v; + if (memcmp(&m1->set_uuid, &m2->set_uuid, 16) == 0) + return true; + + return false; +} + +static void meta_md_inc_counter(hr_volume_t *vol) +{ + fibril_mutex_lock(&vol->md_lock); + + for (size_t d = 0; d < vol->extent_no; d++) { + struct mdp_superblock_1 *md = (struct mdp_superblock_1 *) + (((uint8_t *)vol->in_mem_md) + MD_SIZE * 512 * d); + md->events++; + } + + fibril_mutex_unlock(&vol->md_lock); +} + +static errno_t meta_md_save(hr_volume_t *vol, bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + fibril_rwlock_read_lock(&vol->extents_lock); + + for (size_t i = 0; i < vol->extent_no; i++) + 
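/* persist each extent's superblock; write errors are reported via the extent state callback in save_ext() */ +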
meta_md_save_ext(vol, i, with_state_callback); + + fibril_rwlock_read_unlock(&vol->extents_lock); + + return EOK; +} + +static errno_t meta_md_save_ext(hr_volume_t *vol, size_t ext_idx, + bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + + void *md_block = hr_calloc_waitok(1, MD_SIZE * 512); + + struct mdp_superblock_1 *md = (struct mdp_superblock_1 *) + (((uint8_t *)vol->in_mem_md) + MD_SIZE * 512 * ext_idx); + + hr_extent_t *ext = &vol->extents[ext_idx]; + + fibril_rwlock_read_lock(&vol->states_lock); + hr_ext_state_t s = ext->state; + fibril_rwlock_read_unlock(&vol->states_lock); + + if (s != HR_EXT_ONLINE && s != HR_EXT_REBUILD) { + free(md_block); + return EINVAL; + } + + fibril_mutex_lock(&vol->md_lock); + + if (s == HR_EXT_REBUILD) { + md->resync_offset = vol->rebuild_blk; + md->feature_map = MD_DISK_SYNC; + } else { + md->resync_offset = 0; + md->feature_map = 0; + } + + meta_md_encode(md, md_block); + errno_t rc = meta_md_write_block(ext->svc_id, md_block); + if (rc != EOK && with_state_callback) + vol->hr_ops.ext_state_cb(vol, ext_idx, rc); + + fibril_mutex_unlock(&vol->md_lock); + + if (with_state_callback) + vol->hr_ops.vol_state_eval(vol); + + free(md_block); + + return EOK; +} + +static const char *meta_md_get_devname(const void *md_v) +{ + const struct mdp_superblock_1 *md = md_v; + + return md->set_name; +} + +static hr_level_t meta_md_get_level(const void *md_v) +{ + const struct mdp_superblock_1 *md = md_v; + + switch (md->level) { + case 0: + return HR_LVL_0; + case 1: + return HR_LVL_1; + case 4: + return HR_LVL_4; + case 5: + return HR_LVL_5; + default: + return HR_LVL_UNKNOWN; + } +} + +static uint64_t meta_md_get_data_offset(void) +{ + return MD_DATA_OFFSET; +} + +static size_t meta_md_get_size(void) +{ + return MD_SIZE; +} + +static uint8_t meta_md_get_flags(void) +{ + uint8_t flags = 0; + + return flags; +} + +static hr_metadata_type_t meta_md_get_type(void) +{ + return HR_METADATA_MD; +} + +static void bytefield_print(const uint8_t *d, size_t s) +{ + size_t i; + + for (i = 0; i < s; i++) + printf("%02x", d[i]); +} + +static void meta_md_dump(const void *md_v) +{ + HR_DEBUG("%s()", __func__); + + const struct mdp_superblock_1 *md = md_v; + + printf("magic: 0x%" PRIx32 "\n", md->magic); + + printf("major_version: %" PRIu32 "\n", md->major_version); + + printf("feature_map: 0x%" PRIx32 "\n", md->feature_map); + + printf("set_uuid: "); + bytefield_print(md->set_uuid, 16); + printf("\n"); + + printf("set_name: %s\n", md->set_name); + + printf("level: %" PRIi32 "\n", md->level); + + printf("layout: %" PRIi32 "\n", md->layout); + + printf("size: %" PRIu64 "\n", md->size); + + printf("chunksize: %" PRIu32 "\n", md->chunksize); + + printf("raid_disks: %" PRIu32 "\n", md->raid_disks); + + printf("data_offset: %" PRIu64 "\n", md->data_offset); + + printf("data_size: %" PRIu64 "\n", md->data_size); + + printf("super_offset: %" PRIu64 "\n", md->super_offset); + + printf("dev_number: %" PRIu32 "\n", md->dev_number); + + printf("device_uuid: "); + bytefield_print(md->device_uuid, 16); + printf("\n"); + + printf("events: %" PRIu64 "\n", md->events); + + if (md->resync_offset == ~(0ULL)) + printf("resync_offset: 0\n"); + else + printf("resync_offset: %" PRIu64 "\n", md->resync_offset); + + printf("max_dev: %" PRIu32 "\n", md->max_dev); + + printf("dev_roles: "); + for (uint32_t d = 0; d < md->max_dev; d++) { + printf("0x%" PRIx16, md->dev_roles[d]); + if (d + 1 < md->max_dev) + printf(", "); + } + printf("\n"); +} + +static void 
*meta_md_alloc_struct(void) +{ + /* 1 KiB: 256 B of superblock plus 2 B per device role, 1 KiB allows 384 devices */ + return calloc(1, MD_SIZE * 512); +} + +static void meta_md_encode(void *md_v, void *block) +{ + HR_DEBUG("%s()", __func__); + + memcpy(block, md_v, meta_md_get_size() * 512); + + struct mdp_superblock_1 *md = block; + + md->magic = host2uint32_t_le(md->magic); + md->major_version = host2uint32_t_le(md->major_version); + md->feature_map = host2uint32_t_le(md->feature_map); + md->level = host2uint32_t_le(md->level); + md->layout = host2uint32_t_le(md->layout); + md->size = host2uint64_t_le(md->size); + md->chunksize = host2uint32_t_le(md->chunksize); + md->raid_disks = host2uint32_t_le(md->raid_disks); + md->data_offset = host2uint64_t_le(md->data_offset); + md->data_size = host2uint64_t_le(md->data_size); + md->super_offset = host2uint64_t_le(md->super_offset); + md->dev_number = host2uint32_t_le(md->dev_number); + md->events = host2uint64_t_le(md->events); + md->resync_offset = host2uint64_t_le(md->resync_offset); + for (uint32_t d = 0; d < md->max_dev; d++) + md->dev_roles[d] = host2uint16_t_le(md->dev_roles[d]); + /* convert max_dev only after it is no longer needed as a loop bound */ + md->max_dev = host2uint32_t_le(md->max_dev); + + md->sb_csum = calc_sb_1_csum(md); +} + +static errno_t meta_md_decode(const void *block, void *md_v) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + struct mdp_superblock_1 *md = md_v; + + /* + * Do in-place decoding to cpu byte order. + * We do it like this because: + * 1) we do not know what is after the 256 bytes + * of the struct, so we write back what was there + * previously, + * 2) we do not want to deal with unused fields such + * as unions and so on. + */ + memcpy(md, block, meta_md_get_size() * 512); + + md->magic = uint32_t_le2host(md->magic); + if (md->magic != MD_MAGIC) { + rc = EINVAL; + goto error; + } + + md->major_version = uint32_t_le2host(md->major_version); + if (md->major_version != 1) { + HR_DEBUG("unsupported metadata version\n"); + rc = EINVAL; + goto error; + } + + md->feature_map = uint32_t_le2host(md->feature_map); + if (md->feature_map != 0x0 && md->feature_map != MD_DISK_SYNC) { + HR_DEBUG("unsupported feature map bits\n"); + rc = EINVAL; + goto error; + } + + md->level = uint32_t_le2host(md->level); + switch (md->level) { + case 0: + case 1: + case 4: + case 5: + break; + default: + HR_DEBUG("unsupported level\n"); + rc = EINVAL; + goto error; + } + + md->layout = uint32_t_le2host(md->layout); + if (md->level == 5) { + switch (md->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC: + case ALGORITHM_LEFT_SYMMETRIC: + break; + default: + HR_DEBUG("unsupported layout\n"); + rc = EINVAL; + goto error; + } + } else if (md->level == 4) { + if (md->layout != 0) { + HR_DEBUG("unsupported layout\n"); + rc = EINVAL; + goto error; + } + } + + md->size = uint64_t_le2host(md->size); + + md->chunksize = uint32_t_le2host(md->chunksize); + + md->raid_disks = uint32_t_le2host(md->raid_disks); + if (md->raid_disks > HR_MAX_EXTENTS) { + rc = EINVAL; + goto error; + } + + md->data_offset = uint64_t_le2host(md->data_offset); + + md->data_size = uint64_t_le2host(md->data_size); + if (md->data_size != md->size) { + rc = EINVAL; + goto error; + } + + md->super_offset = uint64_t_le2host(md->super_offset); + if (md->super_offset != MD_OFFSET) { + rc = EINVAL; + goto error; + } + + md->dev_number = uint32_t_le2host(md->dev_number); + + md->events = uint64_t_le2host(md->events); + + md->resync_offset = uint64_t_le2host(md->resync_offset); + if (md->feature_map == 0 && md->resync_offset != ~(0ULL)) { + rc = EINVAL; + 
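/* a clean member (feature_map == 0) must carry the all-ones resync_offset */ +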
goto error; + } + + md->max_dev = uint32_t_le2host(md->max_dev); + if (md->max_dev > 256 + 128) { + rc = EINVAL; + goto error; + } + + for (uint32_t d = 0; d < md->max_dev; d++) + md->dev_roles[d] = uint16_t_le2host(md->dev_roles[d]); + +error: + return rc; +} + +static errno_t meta_md_get_block(service_id_t dev, void **rblock) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + void *block; + + if (rblock == NULL) + return EINVAL; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize != 512) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < MD_OFFSET + MD_SIZE) + return EINVAL; + + block = malloc(bsize * MD_SIZE); + if (block == NULL) + return ENOMEM; + + rc = hr_read_direct(dev, MD_OFFSET, MD_SIZE, block); + if (rc != EOK) { + free(block); + return rc; + } + + *rblock = block; + return EOK; +} + +static errno_t meta_md_write_block(service_id_t dev, const void *block) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize != 512) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < MD_OFFSET + MD_SIZE) + return EINVAL; + + rc = hr_write_direct(dev, MD_OFFSET, MD_SIZE, block); + + return rc; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/foreign/md/md_p.h b/uspace/srv/bd/hr/metadata/foreign/md/md_p.h new file mode 100644 index 0000000000..0f4fa22514 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/md/md_p.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * md_p.h : physical layout of Linux RAID devices + * Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_METADATA_FOREIGN_MD_H +#define _HR_METADATA_FOREIGN_MD_H + +typedef uint64_t __le64; +typedef uint32_t __le32; +typedef uint16_t __le16; +typedef uint8_t __u8; + +/* in 512 blocks */ +#define MD_OFFSET 8 +#define MD_SIZE 2 + +/* this is actually not used when assembling */ +#define MD_DATA_OFFSET 2048 + +#define MD_MAGIC 0xa92b4efc + +#define MD_DISK_SYNC 2 + +/* + * The version-1 superblock : + * All numeric fields are little-endian. + * + * total size: 256 bytes plus 2 per device. + * 1K allows 384 devices. + */ +struct mdp_superblock_1 { + /* constant array information - 128 bytes */ + __le32 magic; /* MD_SB_MAGIC: 0xa92b4efc - little endian */ + __le32 major_version; /* 1 */ + __le32 feature_map; /* bit 0 set if 'bitmap_offset' is meaningful */ + __le32 pad0; /* always set to 0 when writing */ + + __u8 set_uuid[16]; /* user-space generated. 
*/ + char set_name[32]; /* set and interpreted by user-space */ + + __le64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0 */ + __le32 level; /* 0,1,4,5, -1 (linear) */ + __le32 layout; /* only for raid5 and raid10 currently */ + __le64 size; /* used size of component devices, in 512byte sectors */ + + __le32 chunksize; /* in 512byte sectors */ + __le32 raid_disks; + union { + __le32 bitmap_offset; + /* + * sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful of feature_map[0] is set. + */ + + /* only meaningful when feature_map[MD_FEATURE_PPL] is set */ + struct { + __le16 offset; /* sectors from start of superblock that ppl starts (signed) */ + __le16 size; /* ppl size in sectors */ + } ppl; + }; + + /* These are only valid with feature bit '4' */ + __le32 new_level; /* new level we are reshaping to */ + __le64 reshape_position; /* next address in array-space for reshape */ + __le32 delta_disks; /* change in number of raid_disks */ + __le32 new_layout; /* new layout */ + __le32 new_chunk; /* new chunk size (512byte sectors) */ + __le32 new_offset; + /* + * signed number to add to data_offset in new + * layout. 0 == no-change. This can be + * different on each device in the array. + */ + + /* constant this-device information - 64 bytes */ + __le64 data_offset; /* sector start of data, often 0 */ + __le64 data_size; /* sectors in this device that can be used for data */ + __le64 super_offset; /* sector start of this superblock */ + union { + __le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + __le64 journal_tail;/* journal tail of journal device (from data_offset) */ + }; + __le32 dev_number; /* permanent identifier of this device - not role in raid */ + __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ + __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ + __u8 devflags; /* per-device flags. Only two defined... */ +#define WriteMostly1 1 /* mask for writemostly flag in above */ +#define FailFast1 2 /* Should avoid retries and fixups and just fail */ + /* + * Bad block log. If there are any bad blocks the feature flag is set. + * If offset and size are non-zero, that space is reserved and available + */ + __u8 bblog_shift; /* shift from sectors to block size */ + __le16 bblog_size; /* number of sectors reserved for list */ + __le32 bblog_offset; + /* + * sector offset from superblock to bblog, + * signed - not unsigned + */ + + /* array state information - 64 bytes */ + __le64 utime; /* 40 bits second, 24 bits microseconds */ + __le64 events; /* incremented when superblock updated */ + __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ + __le32 sb_csum; /* checksum up to devs[max_dev] */ + __le32 max_dev; /* size of devs[] array to consider */ + __u8 pad3[64 - 32]; /* set to 0 when writing */ + + /* + * device state information. Indexed by dev_number. + * 2 bytes per device + * Note there are no per-device state flags. State information is rolled + * into the 'roles' value. If a device is spare or faulty, then it doesn't + * have a meaningful role. + */ + __le16 dev_roles[]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ +}; + +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2001-2016 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +/* from mdadm - mdadm.h */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 + +/* from mdadm - super1.c */ +static inline unsigned int calc_sb_1_csum(struct mdp_superblock_1 *sb) +{ + unsigned int disk_csum, csum; + unsigned long long newcsum; + int size = sizeof(*sb) + uint32_t_le2host(sb->max_dev) * 2; + unsigned int *isuper = (unsigned int *)sb; + + /* make sure I can count... */ + if (offsetof(struct mdp_superblock_1, data_offset) != 128 || + offsetof(struct mdp_superblock_1, utime) != 192 || + sizeof(struct mdp_superblock_1) != 256) { + fprintf(stderr, "WARNING - superblock isn't sized correctly\n"); + } + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + newcsum = 0; + for (; size >= 4; size -= 4) { + newcsum += uint32_t_le2host(*isuper); + isuper++; + } + + if (size == 2) + newcsum += uint32_t_le2host(*(unsigned short *) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); + sb->sb_csum = disk_csum; + return host2uint32_t_le(csum); +} + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/foreign/softraid/hr_softraid.c b/uspace/srv/bd/hr/metadata/foreign/softraid/hr_softraid.c new file mode 100644 index 0000000000..c1cb5415a5 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/softraid/hr_softraid.c @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../io.h" +#include "../../../util.h" +#include "../../../var.h" + +#include "softraidvar.h" + +/* not exposed */ +static void *meta_softraid_alloc_struct(void); +static void meta_softraid_encode(void *, void *); +static errno_t meta_softraid_decode(const void *, void *); +static errno_t meta_softraid_get_block(service_id_t, void **); +static errno_t meta_softraid_write_block(service_id_t, const void *); + +static errno_t meta_softraid_probe(service_id_t, void **); +static errno_t meta_softraid_init_vol2meta(hr_volume_t *); +static errno_t meta_softraid_init_meta2vol(const list_t *, hr_volume_t *); +static errno_t meta_softraid_erase_block(service_id_t); +static bool meta_softraid_compare_uuids(const void *, const void *); +static void meta_softraid_inc_counter(hr_volume_t *); +static errno_t meta_softraid_save(hr_volume_t *, bool); +static errno_t meta_softraid_save_ext(hr_volume_t *, size_t, bool); +static const char *meta_softraid_get_devname(const void *); +static hr_level_t meta_softraid_get_level(const void *); +static uint64_t meta_softraid_get_data_offset(void); +static size_t meta_softraid_get_size(void); +static uint8_t meta_softraid_get_flags(void); +static hr_metadata_type_t meta_softraid_get_type(void); +static void meta_softraid_dump(const void *); + +hr_superblock_ops_t metadata_softraid_ops = { + .probe = meta_softraid_probe, + .init_vol2meta = meta_softraid_init_vol2meta, + .init_meta2vol = meta_softraid_init_meta2vol, + .erase_block = meta_softraid_erase_block, + .compare_uuids = meta_softraid_compare_uuids, + .inc_counter = meta_softraid_inc_counter, + .save = meta_softraid_save, + .save_ext = meta_softraid_save_ext, + .get_devname = meta_softraid_get_devname, + .get_level = meta_softraid_get_level, + .get_data_offset = meta_softraid_get_data_offset, + .get_size = meta_softraid_get_size, + .get_flags = meta_softraid_get_flags, + .get_type = meta_softraid_get_type, + .dump = meta_softraid_dump +}; + +static errno_t meta_softraid_probe(service_id_t svc_id, void **rmd) +{ + errno_t rc; + void *meta_block; + + void *metadata_struct = meta_softraid_alloc_struct(); + if (metadata_struct == NULL) + return ENOMEM; + + rc = meta_softraid_get_block(svc_id, &meta_block); + if (rc != EOK) + goto error; + + rc = meta_softraid_decode(meta_block, metadata_struct); + + free(meta_block); + + if (rc != EOK) + goto error; + + *rmd = metadata_struct; + return EOK; + +error: + free(metadata_struct); + return rc; +} + +static errno_t meta_softraid_init_vol2meta(hr_volume_t *vol) +{ + (void)vol; + + return ENOTSUP; +} + +static errno_t meta_softraid_init_meta2vol(const list_t *list, hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + + struct sr_metadata *main_meta = NULL; + uint64_t max_counter_val = 0; + + list_foreach(*list, link, struct dev_list_member, 
iter) { + struct sr_metadata *iter_meta = iter->md; + + if (iter_meta->ssd_ondisk >= max_counter_val) { + max_counter_val = iter_meta->ssd_ondisk; + main_meta = iter_meta; + } + } + + assert(main_meta != NULL); + + vol->bsize = main_meta->ssdi.ssd_secsize; + + vol->data_blkno = main_meta->ssdi.ssd_size; + + /* get coerced size from some (first) chunk metadata */ + struct sr_meta_chunk *mc = (struct sr_meta_chunk *)(main_meta + 1); + vol->truncated_blkno = mc->scmi.scm_coerced_size; + + vol->data_offset = main_meta->ssd_data_blkno; + + if (main_meta->ssdi.ssd_chunk_no > HR_MAX_EXTENTS) { + HR_DEBUG("Assembled volume has %u extents (max = %u)", + (unsigned)main_meta->ssdi.ssd_chunk_no, + HR_MAX_EXTENTS); + rc = EINVAL; + goto error; + } + + vol->extent_no = main_meta->ssdi.ssd_chunk_no; + + if (main_meta->ssdi.ssd_level == 5) + vol->layout = HR_LAYOUT_RAID5_NR; + else + vol->layout = HR_LAYOUT_NONE; + + vol->strip_size = main_meta->ssdi.ssd_strip_size; + + vol->in_mem_md = calloc(1, SR_META_SIZE * DEV_BSIZE); + if (vol->in_mem_md == NULL) + return ENOMEM; + memcpy(vol->in_mem_md, main_meta, SR_META_SIZE * DEV_BSIZE); + + bool rebuild_set = false; + list_foreach(*list, link, struct dev_list_member, iter) { + struct sr_metadata *iter_meta = iter->md; + + uint8_t index = iter_meta->ssdi.ssd_chunk_id; + + vol->extents[index].svc_id = iter->svc_id; + + struct sr_meta_chunk *mc = + ((struct sr_meta_chunk *)(main_meta + 1)) + index; + + bool invalidate = false; + bool rebuild_this_ext = false; + + if (iter_meta->ssd_meta_flags & SR_META_DIRTY) + invalidate = true; + if (iter_meta->ssd_ondisk != max_counter_val) + invalidate = true; + + if (mc->scm_status == BIOC_SDREBUILD && !invalidate) { + if (rebuild_set) { + HR_DEBUG("only 1 rebuilt extent allowed"); + rc = EINVAL; + goto error; + } + rebuild_set = true; + rebuild_this_ext = true; + vol->rebuild_blk = iter_meta->ssd_rebuild; + } + + if (!rebuild_this_ext && !invalidate) + vol->extents[index].state = HR_EXT_ONLINE; + else if (rebuild_this_ext && !invalidate) + vol->extents[index].state = HR_EXT_REBUILD; + else + vol->extents[index].state = HR_EXT_INVALID; + } + + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_NONE) + vol->extents[i].state = HR_EXT_MISSING; + } + +error: + return rc; +} + +static errno_t meta_softraid_erase_block(service_id_t dev) +{ + HR_DEBUG("%s()", __func__); + + (void)dev; + + return ENOTSUP; +} + +static bool meta_softraid_compare_uuids(const void *m1_v, const void *m2_v) +{ + const struct sr_metadata *m1 = m1_v; + const struct sr_metadata *m2 = m2_v; + if (memcmp(&m1->ssdi.ssd_uuid, &m2->ssdi.ssd_uuid, SR_UUID_MAX) == 0) + return true; + + return false; +} + +static void meta_softraid_inc_counter(hr_volume_t *vol) +{ + fibril_mutex_lock(&vol->md_lock); + + struct sr_metadata *md = vol->in_mem_md; + + md->ssd_ondisk++; + + fibril_mutex_unlock(&vol->md_lock); +} + +static errno_t meta_softraid_save(hr_volume_t *vol, bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + fibril_rwlock_read_lock(&vol->extents_lock); + + for (size_t i = 0; i < vol->extent_no; i++) + meta_softraid_save_ext(vol, i, with_state_callback); + + fibril_rwlock_read_unlock(&vol->extents_lock); + + return EOK; +} + +static errno_t meta_softraid_save_ext(hr_volume_t *vol, size_t ext_idx, + bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + + void *md_block = hr_calloc_waitok(1, vol->bsize * SR_META_SIZE); + + struct sr_metadata *md = 
vol->in_mem_md; + struct sr_meta_chunk *mc = + ((struct sr_meta_chunk *)(md + 1)) + ext_idx; + + hr_extent_t *ext = &vol->extents[ext_idx]; + + fibril_rwlock_read_lock(&vol->states_lock); + hr_ext_state_t s = ext->state; + fibril_rwlock_read_unlock(&vol->states_lock); + + if (s != HR_EXT_ONLINE && s != HR_EXT_REBUILD) { + free(md_block); + return EINVAL; + } + + fibril_mutex_lock(&vol->md_lock); + + md->ssd_rebuild = vol->rebuild_blk; + md->ssdi.ssd_chunk_id = ext_idx; + + if (s == HR_EXT_REBUILD) + mc->scm_status = BIOC_SDREBUILD; + else + mc->scm_status = BIOC_SDONLINE; + + meta_softraid_encode(md, md_block); + errno_t rc = meta_softraid_write_block(ext->svc_id, md_block); + if (rc != EOK && with_state_callback) + vol->hr_ops.ext_state_cb(vol, ext_idx, rc); + + fibril_mutex_unlock(&vol->md_lock); + + if (with_state_callback) + vol->hr_ops.vol_state_eval(vol); + + free(md_block); + return EOK; +} + +static const char *meta_softraid_get_devname(const void *md_v) +{ + const struct sr_metadata *md = md_v; + + return md->ssd_devname; +} + +static hr_level_t meta_softraid_get_level(const void *md_v) +{ + const struct sr_metadata *md = md_v; + + switch (md->ssdi.ssd_level) { + case 0: + return HR_LVL_0; + case 1: + return HR_LVL_1; + case 5: + return HR_LVL_5; + default: + return HR_LVL_UNKNOWN; + } +} + +static uint64_t meta_softraid_get_data_offset(void) +{ + return SR_DATA_OFFSET; +} + +static size_t meta_softraid_get_size(void) +{ + return SR_META_SIZE; +} + +static uint8_t meta_softraid_get_flags(void) +{ + uint8_t flags = 0; + + flags |= HR_METADATA_ALLOW_REBUILD; + + return flags; +} + +static hr_metadata_type_t meta_softraid_get_type(void) +{ + return HR_METADATA_SOFTRAID; +} + +static void meta_softraid_dump(const void *md_v) +{ + HR_DEBUG("%s()", __func__); + + const struct sr_metadata *md = md_v; + + sr_meta_print(md); +} + +static void *meta_softraid_alloc_struct(void) +{ + return calloc(1, SR_META_SIZE * DEV_BSIZE); +} + +static void meta_softraid_encode(void *md_v, void *block) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + struct sr_metadata *md = md_v; + uint8_t md5_hash[16]; + + struct sr_metadata *scratch_md = + hr_calloc_waitok(1, SR_META_SIZE * DEV_BSIZE); + + scratch_md->ssdi.ssd_magic = host2uint64_t_le(md->ssdi.ssd_magic); + scratch_md->ssdi.ssd_version = host2uint32_t_le(md->ssdi.ssd_version); + scratch_md->ssdi.ssd_vol_flags = + host2uint32_t_le(md->ssdi.ssd_vol_flags); + memcpy(&scratch_md->ssdi.ssd_uuid, &md->ssdi.ssd_uuid, SR_UUID_MAX); + scratch_md->ssdi.ssd_chunk_no = + host2uint32_t_le(md->ssdi.ssd_chunk_no); + scratch_md->ssdi.ssd_chunk_id = + host2uint32_t_le(md->ssdi.ssd_chunk_id); + scratch_md->ssdi.ssd_opt_no = host2uint32_t_le(md->ssdi.ssd_opt_no); + scratch_md->ssdi.ssd_secsize = host2uint32_t_le(md->ssdi.ssd_secsize); + scratch_md->ssdi.ssd_volid = host2uint32_t_le(md->ssdi.ssd_volid); + scratch_md->ssdi.ssd_level = host2uint32_t_le(md->ssdi.ssd_level); + scratch_md->ssdi.ssd_size = host2int64_t_le(md->ssdi.ssd_size); + memcpy(scratch_md->ssdi.ssd_vendor, md->ssdi.ssd_vendor, 8); + memcpy(scratch_md->ssdi.ssd_product, md->ssdi.ssd_product, 16); + memcpy(scratch_md->ssdi.ssd_revision, md->ssdi.ssd_revision, 4); + scratch_md->ssdi.ssd_strip_size = + host2uint32_t_le(md->ssdi.ssd_strip_size); + rc = create_hash((const uint8_t *)&scratch_md->ssdi, + sizeof(struct sr_meta_invariant), md5_hash, HASH_MD5); + assert(rc == EOK); + memcpy(scratch_md->ssd_checksum, md5_hash, MD5_DIGEST_LENGTH); + + memcpy(scratch_md->ssd_devname, md->ssd_devname, 32); + + 
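/* + * The MD5 in ssd_checksum covers only the invariant ssdi + * block computed above; ssd_devname and the runtime fields + * below are stored without a checksum. + */ +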
scratch_md->ssd_meta_flags = host2uint32_t_le(md->ssd_meta_flags); + scratch_md->ssd_data_blkno = host2uint32_t_le(md->ssd_data_blkno); + scratch_md->ssd_ondisk = host2uint64_t_le(md->ssd_ondisk); + scratch_md->ssd_rebuild = host2int64_t_le(md->ssd_rebuild); + + struct sr_meta_chunk *scratch_mc = + (struct sr_meta_chunk *)(scratch_md + 1); + struct sr_meta_chunk *mc = (struct sr_meta_chunk *)(md + 1); + for (size_t i = 0; i < md->ssdi.ssd_chunk_no; i++, mc++, scratch_mc++) { + scratch_mc->scmi.scm_volid = + host2uint32_t_le(mc->scmi.scm_volid); + scratch_mc->scmi.scm_chunk_id = + host2uint32_t_le(mc->scmi.scm_chunk_id); + memcpy(scratch_mc->scmi.scm_devname, mc->scmi.scm_devname, 32); + scratch_mc->scmi.scm_size = host2int64_t_le(mc->scmi.scm_size); + scratch_mc->scmi.scm_coerced_size = + host2int64_t_le(mc->scmi.scm_coerced_size); + memcpy(&scratch_mc->scmi.scm_uuid, &mc->scmi.scm_uuid, + SR_UUID_MAX); + + rc = create_hash((const uint8_t *)&scratch_mc->scmi, + sizeof(struct sr_meta_chunk_invariant), md5_hash, HASH_MD5); + assert(rc == EOK); + + memcpy(scratch_mc->scm_checksum, md5_hash, + MD5_DIGEST_LENGTH); + scratch_mc->scm_status = host2uint32_t_le(mc->scm_status); + } + + struct sr_meta_opt_hdr *scratch_om = + (struct sr_meta_opt_hdr *)((u_int8_t *)(scratch_md + 1) + + sizeof(struct sr_meta_chunk) * md->ssdi.ssd_chunk_no); + struct sr_meta_opt_hdr *om = + (struct sr_meta_opt_hdr *)((u_int8_t *)(md + 1) + + sizeof(struct sr_meta_chunk) * md->ssdi.ssd_chunk_no); + for (size_t i = 0; i < md->ssdi.ssd_opt_no; i++) { + scratch_om->som_type = host2uint32_t_le(om->som_type); + scratch_om->som_length = host2uint32_t_le(om->som_length); + memcpy(scratch_om->som_checksum, om->som_checksum, + MD5_DIGEST_LENGTH); + + /* + * No need to compute a checksum, we don't support optional + * headers. Despite this, we still copy the headers over. 
+ */ + + uint32_t om_len = om->som_length; + + om = (struct sr_meta_opt_hdr *)((void *)om + om_len); + scratch_om = + (struct sr_meta_opt_hdr *)((void *)scratch_om + om_len); + } + + memcpy(block, scratch_md, meta_softraid_get_size() * 512); + + free(scratch_md); +} + +static errno_t meta_softraid_decode(const void *block, void *md_v) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + struct sr_metadata *md = md_v; + uint8_t md5_hash[16]; + + struct sr_metadata *scratch_md = meta_softraid_alloc_struct(); + if (scratch_md == NULL) + return ENOMEM; + + memcpy(scratch_md, block, meta_softraid_get_size() * 512); + + md->ssdi.ssd_magic = + uint64_t_le2host(scratch_md->ssdi.ssd_magic); + if (md->ssdi.ssd_magic != SR_MAGIC) { + rc = EINVAL; + goto error; + } + md->ssdi.ssd_version = uint32_t_le2host(scratch_md->ssdi.ssd_version); + if (md->ssdi.ssd_version != SR_META_VERSION) { + HR_DEBUG("unsupported metadata version\n"); + rc = EINVAL; + goto error; + } + md->ssdi.ssd_vol_flags = + uint32_t_le2host(scratch_md->ssdi.ssd_vol_flags); + memcpy(&md->ssdi.ssd_uuid, &scratch_md->ssdi.ssd_uuid, SR_UUID_MAX); + + md->ssdi.ssd_chunk_no = + uint32_t_le2host(scratch_md->ssdi.ssd_chunk_no); + md->ssdi.ssd_chunk_id = + uint32_t_le2host(scratch_md->ssdi.ssd_chunk_id); + if (md->ssdi.ssd_chunk_no > HR_MAX_EXTENTS || + md->ssdi.ssd_chunk_id >= md->ssdi.ssd_chunk_no) { + rc = EINVAL; + goto error; + } + + md->ssdi.ssd_opt_no = uint32_t_le2host(scratch_md->ssdi.ssd_opt_no); + if (md->ssdi.ssd_opt_no > 0) { + HR_DEBUG("unsupported optional metadata\n"); + rc = EINVAL; + goto error; + } + md->ssdi.ssd_secsize = uint32_t_le2host(scratch_md->ssdi.ssd_secsize); + if (md->ssdi.ssd_secsize != DEV_BSIZE) { + HR_DEBUG("unsupported sector size\n"); + rc = EINVAL; + goto error; + } + + md->ssdi.ssd_volid = uint32_t_le2host(scratch_md->ssdi.ssd_volid); + md->ssdi.ssd_level = uint32_t_le2host(scratch_md->ssdi.ssd_level); + md->ssdi.ssd_size = int64_t_le2host(scratch_md->ssdi.ssd_size); + memcpy(md->ssdi.ssd_vendor, scratch_md->ssdi.ssd_vendor, 8); + memcpy(md->ssdi.ssd_product, scratch_md->ssdi.ssd_product, 16); + memcpy(md->ssdi.ssd_revision, scratch_md->ssdi.ssd_revision, 4); + md->ssdi.ssd_strip_size = + uint32_t_le2host(scratch_md->ssdi.ssd_strip_size); + + rc = create_hash((const uint8_t *)&scratch_md->ssdi, + sizeof(struct sr_meta_invariant), md5_hash, HASH_MD5); + assert(rc == EOK); + if (memcmp(md5_hash, scratch_md->ssd_checksum, 16) != 0) { + HR_DEBUG("ssd_checksum invalid\n"); + rc = EINVAL; + goto error; + } + + memcpy(md->ssd_checksum, scratch_md->ssd_checksum, MD5_DIGEST_LENGTH); + + memcpy(md->ssd_devname, scratch_md->ssd_devname, 32); + md->ssd_meta_flags = uint32_t_le2host(scratch_md->ssd_meta_flags); + md->ssd_data_blkno = uint32_t_le2host(scratch_md->ssd_data_blkno); + md->ssd_ondisk = uint64_t_le2host(scratch_md->ssd_ondisk); + md->ssd_rebuild = int64_t_le2host(scratch_md->ssd_rebuild); + + struct sr_meta_chunk *scratch_mc = + (struct sr_meta_chunk *)(scratch_md + 1); + struct sr_meta_chunk *mc = (struct sr_meta_chunk *)(md + 1); + for (size_t i = 0; i < md->ssdi.ssd_chunk_no; i++, mc++, scratch_mc++) { + mc->scmi.scm_volid = + uint32_t_le2host(scratch_mc->scmi.scm_volid); + mc->scmi.scm_chunk_id = + uint32_t_le2host(scratch_mc->scmi.scm_chunk_id); + memcpy(mc->scmi.scm_devname, scratch_mc->scmi.scm_devname, 32); + mc->scmi.scm_size = int64_t_le2host(scratch_mc->scmi.scm_size); + mc->scmi.scm_coerced_size = + int64_t_le2host(scratch_mc->scmi.scm_coerced_size); + memcpy(&mc->scmi.scm_uuid, &scratch_mc->scmi.scm_uuid, + SR_UUID_MAX); + + memcpy(mc->scm_checksum, scratch_mc->scm_checksum, + MD5_DIGEST_LENGTH); + mc->scm_status = 
uint32_t_le2host(scratch_mc->scm_status); + + /* + * This commented-out check found a bug in OpenBSD's softraid + * chunk metadata initialization; a fix has been proposed [1]. + * Once that is fixed, feel free to uncomment it, although it + * will only work on newly created volumes. + * + * [1]: https://marc.info/?l=openbsd-tech&m=174535579711235&w=2 + */ + /* + * rc = create_hash((const uint8_t *)&scratch_mc->scmi, + * sizeof(struct sr_meta_chunk_invariant), md5_hash, HASH_MD5); + * assert(rc == EOK); + * if (memcmp(md5_hash, mc->scm_checksum, 16) != 0) { + * HR_DEBUG("chunk %zu, scm_checksum invalid\n", i); + * rc = EINVAL; + * goto error; + * } + */ + } + + struct sr_meta_opt_hdr *scratch_om = + (struct sr_meta_opt_hdr *)((u_int8_t *)(scratch_md + 1) + + sizeof(struct sr_meta_chunk) * md->ssdi.ssd_chunk_no); + struct sr_meta_opt_hdr *om = + (struct sr_meta_opt_hdr *)((u_int8_t *)(md + 1) + + sizeof(struct sr_meta_chunk) * md->ssdi.ssd_chunk_no); + for (size_t i = 0; i < md->ssdi.ssd_opt_no; i++) { + om->som_type = uint32_t_le2host(scratch_om->som_type); + om->som_length = uint32_t_le2host(scratch_om->som_length); + memcpy(om->som_checksum, scratch_om->som_checksum, + MD5_DIGEST_LENGTH); + + /* + * No need to verify the checksum, we don't support optional + * headers. Despite this, we still load the headers. + */ + + uint32_t som_length = om->som_length; + om = (struct sr_meta_opt_hdr *)((void *)om + + som_length); + scratch_om = (struct sr_meta_opt_hdr *)((void *)scratch_om + + som_length); + } + +error: + free(scratch_md); + + return rc; +} + +static errno_t meta_softraid_get_block(service_id_t dev, void **rblock) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + void *block; + + if (rblock == NULL) + return EINVAL; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize != DEV_BSIZE) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < SR_META_OFFSET + SR_META_SIZE) + return EINVAL; + + block = malloc(bsize * SR_META_SIZE); + if (block == NULL) + return ENOMEM; + + rc = hr_read_direct(dev, SR_META_OFFSET, SR_META_SIZE, block); + if (rc != EOK) { + free(block); + return rc; + } + + *rblock = block; + return EOK; +} + +static errno_t meta_softraid_write_block(service_id_t dev, const void *block) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize != DEV_BSIZE) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < SR_META_OFFSET + SR_META_SIZE) + return EINVAL; + + rc = hr_write_direct(dev, SR_META_OFFSET, SR_META_SIZE, block); + + return rc; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/foreign/softraid/softraid.c b/uspace/srv/bd/hr/metadata/foreign/softraid/softraid.c new file mode 100644 index 0000000000..06becb926e --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/softraid/softraid.c @@ -0,0 +1,126 @@ +/* $OpenBSD: softraid.c,v 1.429 2022/12/21 09:54:23 kn Exp $ */ +/* + * Copyright (c) 2007, 2008, 2009 Marco Peereboom + * Copyright (c) 2008 Chris Kuethe + * Copyright (c) 2009 Joel Sing + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* stripped down softraid.c */ + +#include <stdio.h> +#include <stdlib.h> + +#include "softraidvar.h" + +void +sr_checksum_print(const u_int8_t *md5) +{ + int i; + + for (i = 0; i < MD5_DIGEST_LENGTH; i++) + printf("%02x", md5[i]); +} + +char * +sr_uuid_format(const struct sr_uuid *uuid) +{ + char *uuidstr; + + /* changed to match HelenOS malloc() */ + uuidstr = malloc(37); + if (uuidstr == NULL) + return NULL; + + snprintf(uuidstr, 37, + "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" + "%02x%02x%02x%02x%02x%02x", + uuid->sui_id[0], uuid->sui_id[1], + uuid->sui_id[2], uuid->sui_id[3], + uuid->sui_id[4], uuid->sui_id[5], + uuid->sui_id[6], uuid->sui_id[7], + uuid->sui_id[8], uuid->sui_id[9], + uuid->sui_id[10], uuid->sui_id[11], + uuid->sui_id[12], uuid->sui_id[13], + uuid->sui_id[14], uuid->sui_id[15]); + + return uuidstr; +} + +void +sr_uuid_print(const struct sr_uuid *uuid, int cr) +{ + char *uuidstr; + + uuidstr = sr_uuid_format(uuid); + if (uuidstr == NULL) + return; + printf("%s%s", uuidstr, (cr ? "\n" : "")); + free(uuidstr); +} + +void +sr_meta_print(const struct sr_metadata *m) +{ + unsigned i; + struct sr_meta_chunk *mc; + struct sr_meta_opt_hdr *omh; + + printf("\tssd_magic 0x%" PRIx64 "\n", m->ssdi.ssd_magic); + printf("\tssd_version %" PRId32 "\n", m->ssdi.ssd_version); + printf("\tssd_vol_flags 0x%" PRIx32 "\n", m->ssdi.ssd_vol_flags); + printf("\tssd_uuid "); + sr_uuid_print(&m->ssdi.ssd_uuid, 1); + printf("\tssd_chunk_no %" PRId32 "\n", m->ssdi.ssd_chunk_no); + printf("\tssd_chunk_id %" PRId32 "\n", m->ssdi.ssd_chunk_id); + printf("\tssd_opt_no %" PRId32 "\n", m->ssdi.ssd_opt_no); + printf("\tssd_volid %" PRId32 "\n", m->ssdi.ssd_volid); + printf("\tssd_level %" PRId32 "\n", m->ssdi.ssd_level); + printf("\tssd_size %" PRId64 "\n", m->ssdi.ssd_size); + printf("\tssd_devname %s\n", m->ssd_devname); + printf("\tssd_vendor %s\n", m->ssdi.ssd_vendor); + printf("\tssd_product %s\n", m->ssdi.ssd_product); + printf("\tssd_revision %s\n", m->ssdi.ssd_revision); + printf("\tssd_strip_size %" PRId32 "\n", m->ssdi.ssd_strip_size); + printf("\tssd_checksum "); + sr_checksum_print(m->ssd_checksum); + printf("\n"); + printf("\tssd_meta_flags 0x%" PRIx32 "\n", m->ssd_meta_flags); + printf("\tssd_ondisk %" PRId64 "\n", m->ssd_ondisk); + printf("\tssd_rebuild %" PRId64 "\n", m->ssd_rebuild); + + mc = (struct sr_meta_chunk *)(m + 1); + for (i = 0; i < m->ssdi.ssd_chunk_no; i++, mc++) { + printf("\t\tscm_volid %" PRId32 "\n", mc->scmi.scm_volid); + printf("\t\tscm_chunk_id %" PRId32 "\n", mc->scmi.scm_chunk_id); + printf("\t\tscm_devname %s\n", mc->scmi.scm_devname); + printf("\t\tscm_size %" PRId64 "\n", mc->scmi.scm_size); + printf("\t\tscm_coerced_size %" PRId64 "\n", mc->scmi.scm_coerced_size); + printf("\t\tscm_uuid "); + sr_uuid_print(&mc->scmi.scm_uuid, 1); + printf("\t\tscm_checksum "); + sr_checksum_print(mc->scm_checksum); + printf("\n"); + printf("\t\tscm_status %" PRId32 "\n", mc->scm_status); + } + + omh = (struct sr_meta_opt_hdr *)((u_int8_t *)(m + 1) + + sizeof(struct sr_meta_chunk) * m->ssdi.ssd_chunk_no); + for (i 
= 0; i < m->ssdi.ssd_opt_no; i++) { + printf("\t\t\tsom_type %" PRId32 "\n", omh->som_type); + printf("\t\t\tsom_checksum "); + sr_checksum_print(omh->som_checksum); + printf("\n"); + omh = (struct sr_meta_opt_hdr *)((void *)omh + + omh->som_length); + } +} diff --git a/uspace/srv/bd/hr/metadata/foreign/softraid/softraidvar.h b/uspace/srv/bd/hr/metadata/foreign/softraid/softraidvar.h new file mode 100644 index 0000000000..927fd60165 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/foreign/softraid/softraidvar.h @@ -0,0 +1,154 @@ +/* $OpenBSD: softraidvar.h,v 1.176 2022/12/19 15:27:06 kn Exp $ */ +/* + * Copyright (c) 2006 Marco Peereboom + * Copyright (c) 2008 Chris Kuethe + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_METADATA_FOREIGN_SOFTRAID_H +#define _HR_METADATA_FOREIGN_SOFTRAID_H + +/* HelenOS specific includes, retypes */ + +#include <stdint.h> + +typedef uint8_t u_int8_t; +typedef uint16_t u_int16_t; +typedef uint32_t u_int32_t; +typedef uint64_t u_int64_t; + +/* copied from <sys/param.h> */ +#define _DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ +#define DEV_BSIZE (1 << _DEV_BSHIFT) + +/* copied from <dev/biovar.h>:struct bioc_disk */ +#define BIOC_SDONLINE 0x00 +#define BIOC_SDOFFLINE 0x01 +#define BIOC_SDFAILED 0x02 +#define BIOC_SDREBUILD 0x03 +#define BIOC_SDHOTSPARE 0x04 +#define BIOC_SDUNUSED 0x05 +#define BIOC_SDSCRUB 0x06 +#define BIOC_SDINVALID 0xff + +/* here continues stripped down and slightly modified softraidvar.h */ + +#define MD5_DIGEST_LENGTH 16 + +#define SR_META_VERSION 6 /* bump when sr_metadata changes */ +#define SR_META_SIZE 64 /* save space at chunk beginning */ +#define SR_META_OFFSET 16 /* skip 8192 bytes at chunk beginning */ + +#define SR_BOOT_OFFSET (SR_META_OFFSET + SR_META_SIZE) +#define SR_BOOT_LOADER_SIZE 320 /* Size of boot loader storage. */ +#define SR_BOOT_LOADER_OFFSET SR_BOOT_OFFSET +#define SR_BOOT_BLOCKS_SIZE 128 /* Size of boot block storage. */ +#define SR_BOOT_BLOCKS_OFFSET (SR_BOOT_LOADER_OFFSET + SR_BOOT_LOADER_SIZE) +#define SR_BOOT_SIZE (SR_BOOT_LOADER_SIZE + SR_BOOT_BLOCKS_SIZE) + +#define SR_HEADER_SIZE (SR_META_SIZE + SR_BOOT_SIZE) +#define SR_DATA_OFFSET (SR_META_OFFSET + SR_HEADER_SIZE) + +#define SR_UUID_MAX 16 +struct sr_uuid { + u_int8_t sui_id[SR_UUID_MAX]; +} __attribute__((packed)); + +struct sr_metadata { + struct sr_meta_invariant { + /* do not change order of ssd_magic, ssd_version */ + u_int64_t ssd_magic; /* magic id */ +#define SR_MAGIC 0x4d4152436372616dLLU + u_int32_t ssd_version; /* meta data version */ + u_int32_t ssd_vol_flags; /* volume specific flags. 
*/ + struct sr_uuid ssd_uuid; /* unique identifier */ + + /* chunks */ + u_int32_t ssd_chunk_no; /* number of chunks */ + u_int32_t ssd_chunk_id; /* chunk identifier */ + + /* optional */ + u_int32_t ssd_opt_no; /* nr of optional md elements */ + u_int32_t ssd_secsize; + + /* volume metadata */ + u_int32_t ssd_volid; /* volume id */ + u_int32_t ssd_level; /* raid level */ + int64_t ssd_size; /* virt disk size in blocks */ + char ssd_vendor[8]; /* scsi vendor */ + char ssd_product[16];/* scsi product */ + char ssd_revision[4];/* scsi revision */ + /* optional volume members */ + u_int32_t ssd_strip_size; /* strip size */ + } _sdd_invariant; +#define ssdi _sdd_invariant + /* MD5 of invariant metadata */ + u_int8_t ssd_checksum[MD5_DIGEST_LENGTH]; + char ssd_devname[32];/* /dev/XXXXX */ + u_int32_t ssd_meta_flags; +#define SR_META_DIRTY 0x1 + u_int32_t ssd_data_blkno; + u_int64_t ssd_ondisk; /* on disk version counter */ + int64_t ssd_rebuild; /* last block of rebuild */ +} __attribute__((packed)); + +struct sr_meta_chunk { + struct sr_meta_chunk_invariant { + u_int32_t scm_volid; /* vd we belong to */ + u_int32_t scm_chunk_id; /* chunk id */ + char scm_devname[32];/* /dev/XXXXX */ + int64_t scm_size; /* size of partition in blocks */ + int64_t scm_coerced_size; /* coerced sz of part in blk */ + struct sr_uuid scm_uuid; /* unique identifier */ + } _scm_invariant; +#define scmi _scm_invariant + /* MD5 of invariant chunk metadata */ + u_int8_t scm_checksum[MD5_DIGEST_LENGTH]; + u_int32_t scm_status; /* use bio bioc_disk status */ +} __attribute__((packed)); + +struct sr_meta_opt_hdr { + u_int32_t som_type; /* optional metadata type. */ + u_int32_t som_length; /* optional metadata length. */ + u_int8_t som_checksum[MD5_DIGEST_LENGTH]; +} __attribute__((packed)); + +#define SR_MD_RAID0 0 +#define SR_MD_RAID1 1 +#define SR_MD_RAID5 2 +#define SR_MD_CACHE 3 +#define SR_MD_CRYPTO 4 +/* AOE was 5 and 6. */ +/* SR_MD_RAID4 was 7. */ +#define SR_MD_RAID6 8 +#define SR_MD_CONCAT 9 +#define SR_MD_RAID1C 10 + +/* functions to export from softraid.c to hr_softraid.c */ +void sr_checksum_print(const u_int8_t *); +char *sr_uuid_format(const struct sr_uuid *); +void sr_uuid_print(const struct sr_uuid *, int); +void sr_meta_print(const struct sr_metadata *); + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/native.c b/uspace/srv/bd/hr/metadata/native.c new file mode 100644 index 0000000000..1062452508 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/native.c @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../io.h" +#include "../util.h" +#include "../var.h" + +#include "native.h" + +/* not exposed */ +static void *meta_native_alloc_struct(void); +static void meta_native_encode(void *, void *); +static errno_t meta_native_decode(const void *, void *); +static errno_t meta_native_get_block(service_id_t, void **); +static errno_t meta_native_write_block(service_id_t, const void *); +static bool meta_native_has_valid_magic(const void *); + +static errno_t meta_native_probe(service_id_t, void **); +static errno_t meta_native_init_vol2meta(hr_volume_t *); +static errno_t meta_native_init_meta2vol(const list_t *, hr_volume_t *); +static errno_t meta_native_erase_block(service_id_t); +static bool meta_native_compare_uuids(const void *, const void *); +static void meta_native_inc_counter(hr_volume_t *); +static errno_t meta_native_save(hr_volume_t *, bool); +static errno_t meta_native_save_ext(hr_volume_t *, size_t, bool); +static const char *meta_native_get_devname(const void *); +static hr_level_t meta_native_get_level(const void *); +static uint64_t meta_native_get_data_offset(void); +static size_t meta_native_get_size(void); +static uint8_t meta_native_get_flags(void); +static hr_metadata_type_t meta_native_get_type(void); +static void meta_native_dump(const void *); + +hr_superblock_ops_t metadata_native_ops = { + .probe = meta_native_probe, + .init_vol2meta = meta_native_init_vol2meta, + .init_meta2vol = meta_native_init_meta2vol, + .erase_block = meta_native_erase_block, + .compare_uuids = meta_native_compare_uuids, + .inc_counter = meta_native_inc_counter, + .save = meta_native_save, + .save_ext = meta_native_save_ext, + .get_devname = meta_native_get_devname, + .get_level = meta_native_get_level, + .get_data_offset = meta_native_get_data_offset, + .get_size = meta_native_get_size, + .get_flags = meta_native_get_flags, + .get_type = meta_native_get_type, + .dump = meta_native_dump +}; + +static errno_t meta_native_probe(service_id_t svc_id, void **rmd) +{ + errno_t rc; + void *meta_block; + + void *metadata_struct = meta_native_alloc_struct(); + if (metadata_struct == NULL) + return ENOMEM; + + rc = meta_native_get_block(svc_id, &meta_block); + if (rc != EOK) + goto error; + + rc = meta_native_decode(meta_block, metadata_struct); + + free(meta_block); + + if (rc != EOK) + goto error; + + *rmd = metadata_struct; + return EOK; + +error: + free(metadata_struct); + return rc; +} + +static errno_t meta_native_init_vol2meta(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + hr_metadata_t *md = calloc(1, sizeof(*md)); + if (md == NULL) + return ENOMEM; + + str_cpy(md->magic, HR_NATIVE_MAGIC_SIZE, HR_NATIVE_MAGIC_STR); + + md->version = HR_NATIVE_METADATA_VERSION; + + md->counter = 0; + + uuid_t uuid; + /* rndgen */ + fibril_usleep(1000); + errno_t rc = uuid_generate(&uuid); + if (rc != 
EOK) { + free(md); + return rc; + } + + memcpy(md->uuid, &uuid, HR_NATIVE_UUID_LEN); + + md->data_blkno = vol->data_blkno; + md->truncated_blkno = vol->truncated_blkno; + md->extent_no = vol->extent_no; + md->level = vol->level; + md->layout = vol->layout; + md->strip_size = vol->strip_size; + md->bsize = vol->bsize; + memcpy(md->devname, vol->devname, HR_DEVNAME_LEN); + + vol->in_mem_md = md; + + return EOK; +} + +static errno_t meta_native_init_meta2vol(const list_t *list, hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + hr_metadata_t *main_meta = NULL; + uint64_t max_counter_val = 0; + + list_foreach(*list, link, struct dev_list_member, iter) { + hr_metadata_t *iter_meta = (hr_metadata_t *)iter->md; + if (iter_meta->counter >= max_counter_val) { + max_counter_val = iter_meta->counter; + main_meta = iter_meta; + } + } + + assert(main_meta != NULL); + + vol->data_blkno = main_meta->data_blkno; + vol->truncated_blkno = main_meta->truncated_blkno; + vol->data_offset = meta_native_get_data_offset(); + vol->extent_no = main_meta->extent_no; + /* vol->level = main_meta->level; */ /* already set */ + vol->layout = main_meta->layout; + vol->strip_size = main_meta->strip_size; + vol->bsize = main_meta->bsize; + /* already set */ + /* memcpy(vol->devname, main_meta->devname, HR_DEVNAME_LEN); */ + + if (vol->extent_no > HR_MAX_EXTENTS) { + HR_DEBUG("Assembled volume has %u extents (max = %u)", + (unsigned)vol->extent_no, HR_MAX_EXTENTS); + return EINVAL; + } + + vol->in_mem_md = calloc(1, sizeof(hr_metadata_t)); + if (vol->in_mem_md == NULL) + return ENOMEM; + memcpy(vol->in_mem_md, main_meta, sizeof(hr_metadata_t)); + + list_foreach(*list, link, struct dev_list_member, iter) { + hr_metadata_t *iter_meta = (hr_metadata_t *)iter->md; + + vol->extents[iter_meta->index].svc_id = iter->svc_id; + + hr_ext_state_t final_ext_state = HR_EXT_INVALID; + if (iter_meta->counter == max_counter_val) { + if (iter_meta->rebuild_pos > 0) { + final_ext_state = HR_EXT_REBUILD; + vol->rebuild_blk = iter_meta->rebuild_pos; + } else { + final_ext_state = HR_EXT_ONLINE; + } + } + + vol->extents[iter_meta->index].state = final_ext_state; + } + + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_NONE) + vol->extents[i].state = HR_EXT_MISSING; + } + + return EOK; +} + +static errno_t meta_native_erase_block(service_id_t dev) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + size_t bsize; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + void *zero_block = calloc(1, bsize); + if (zero_block == NULL) + return ENOMEM; + + rc = meta_native_write_block(dev, zero_block); + free(zero_block); + return rc; +} + +static bool meta_native_compare_uuids(const void *m1p, const void *m2p) +{ + const hr_metadata_t *m1 = m1p; + const hr_metadata_t *m2 = m2p; + if (memcmp(m1->uuid, m2->uuid, HR_NATIVE_UUID_LEN) == 0) + return true; + + return false; +} + +static void meta_native_inc_counter(hr_volume_t *vol) +{ + fibril_mutex_lock(&vol->md_lock); + + hr_metadata_t *md = vol->in_mem_md; + + md->counter++; + + fibril_mutex_unlock(&vol->md_lock); +} + +static errno_t meta_native_save(hr_volume_t *vol, bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + fibril_rwlock_read_lock(&vol->extents_lock); + + for (size_t i = 0; i < vol->extent_no; i++) + meta_native_save_ext(vol, i, with_state_callback); + + fibril_rwlock_read_unlock(&vol->extents_lock); + + return EOK; +} + +static errno_t meta_native_save_ext(hr_volume_t *vol, size_t ext_idx, + bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + 
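/* + * The caller must hold vol->extents_lock, as asserted below; + * meta_native_save() takes it before calling in here. + */ + 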
assert(fibril_rwlock_is_locked(&vol->extents_lock)); + + void *md_block = hr_calloc_waitok(1, vol->bsize); + + hr_metadata_t *md = (hr_metadata_t *)vol->in_mem_md; + + hr_extent_t *ext = &vol->extents[ext_idx]; + + fibril_rwlock_read_lock(&vol->states_lock); + hr_ext_state_t s = ext->state; + fibril_rwlock_read_unlock(&vol->states_lock); + + if (s != HR_EXT_ONLINE && s != HR_EXT_REBUILD) { + free(md_block); + return EINVAL; + } + + fibril_mutex_lock(&vol->md_lock); + + md->index = ext_idx; + if (s == HR_EXT_REBUILD) + md->rebuild_pos = vol->rebuild_blk; + else + md->rebuild_pos = 0; + meta_native_encode(md, md_block); + errno_t rc = meta_native_write_block(ext->svc_id, md_block); + if (rc != EOK && with_state_callback) + vol->hr_ops.ext_state_cb(vol, ext_idx, rc); + + fibril_mutex_unlock(&vol->md_lock); + + if (with_state_callback) + vol->hr_ops.vol_state_eval(vol); + + free(md_block); + return EOK; +} + +static const char *meta_native_get_devname(const void *md_v) +{ + const hr_metadata_t *md = md_v; + + return md->devname; +} + +static hr_level_t meta_native_get_level(const void *md_v) +{ + const hr_metadata_t *md = md_v; + + return md->level; +} + +static uint64_t meta_native_get_data_offset(void) +{ + return HR_NATIVE_DATA_OFF; +} + +static size_t meta_native_get_size(void) +{ + return HR_NATIVE_META_SIZE; +} + +static uint8_t meta_native_get_flags(void) +{ + uint8_t flags = 0; + + flags |= HR_METADATA_HOTSPARE_SUPPORT; + flags |= HR_METADATA_ALLOW_REBUILD; + + return flags; +} + +static hr_metadata_type_t meta_native_get_type(void) +{ + return HR_METADATA_NATIVE; +} + +static void meta_native_dump(const void *md_v) +{ + HR_DEBUG("%s()", __func__); + + const hr_metadata_t *metadata = md_v; + + printf("\tmagic: %s\n", metadata->magic); + printf("\tUUID: "); + for (size_t i = 0; i < HR_NATIVE_UUID_LEN; ++i) { + printf("%.2X", metadata->uuid[i]); + if (i + 1 < HR_NATIVE_UUID_LEN) + printf(" "); + } + printf("\n"); + printf("\tdata_blkno: %" PRIu64 "\n", metadata->data_blkno); + printf("\ttruncated_blkno: %" PRIu64 "\n", metadata->truncated_blkno); + printf("\tcounter: %" PRIu64 "\n", metadata->counter); + printf("\tversion: %" PRIu32 "\n", metadata->version); + printf("\textent_no: %" PRIu32 "\n", metadata->extent_no); + printf("\tindex: %" PRIu32 "\n", metadata->index); + printf("\tlevel: %" PRIu32 "\n", metadata->level); + printf("\tlayout: %" PRIu32 "\n", metadata->layout); + printf("\tstrip_size: %" PRIu32 "\n", metadata->strip_size); + printf("\tbsize: %" PRIu32 "\n", metadata->bsize); + printf("\tdevname: %s\n", metadata->devname); +} + +static void *meta_native_alloc_struct(void) +{ + return calloc(1, sizeof(hr_metadata_t)); +} + +static void meta_native_encode(void *md_v, void *block) +{ + HR_DEBUG("%s()", __func__); + + const hr_metadata_t *metadata = md_v; + + /* + * Use scratch metadata for easier encoding without the need + * for manually specifying offsets. 
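+ * The struct is declared packed (see native.h), so after the + * per-field byte-order conversion the scratch copy can simply be + * memcpy()d into the on-disk block.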
+ */ + hr_metadata_t scratch_md; + + memcpy(scratch_md.magic, metadata->magic, HR_NATIVE_MAGIC_SIZE); + memcpy(scratch_md.uuid, metadata->uuid, HR_NATIVE_UUID_LEN); + + scratch_md.data_blkno = host2uint64_t_le(metadata->data_blkno); + scratch_md.truncated_blkno = host2uint64_t_le( + metadata->truncated_blkno); + scratch_md.counter = host2uint64_t_le(metadata->counter); + scratch_md.rebuild_pos = host2uint64_t_le(metadata->rebuild_pos); + scratch_md.version = host2uint32_t_le(metadata->version); + scratch_md.extent_no = host2uint32_t_le(metadata->extent_no); + scratch_md.index = host2uint32_t_le(metadata->index); + scratch_md.level = host2uint32_t_le(metadata->level); + scratch_md.layout = host2uint32_t_le(metadata->layout); + scratch_md.strip_size = host2uint32_t_le(metadata->strip_size); + scratch_md.bsize = host2uint32_t_le(metadata->bsize); + memcpy(scratch_md.devname, metadata->devname, HR_DEVNAME_LEN); + + memcpy(block, &scratch_md, sizeof(hr_metadata_t)); +} + +static errno_t meta_native_decode(const void *block, void *md_v) +{ + HR_DEBUG("%s()", __func__); + + hr_metadata_t *metadata = md_v; + + /* + * Use scratch metadata for easier decoding without the need + * for manually specifying offsets. + */ + hr_metadata_t scratch_md; + memcpy(&scratch_md, block, sizeof(hr_metadata_t)); + + memcpy(metadata->magic, scratch_md.magic, HR_NATIVE_MAGIC_SIZE); + if (!meta_native_has_valid_magic(metadata)) + return EINVAL; + + memcpy(metadata->uuid, scratch_md.uuid, HR_NATIVE_UUID_LEN); + + metadata->data_blkno = uint64_t_le2host(scratch_md.data_blkno); + metadata->truncated_blkno = uint64_t_le2host( + scratch_md.truncated_blkno); + metadata->counter = uint64_t_le2host(scratch_md.counter); + metadata->rebuild_pos = uint64_t_le2host(scratch_md.rebuild_pos); + metadata->version = uint32_t_le2host(scratch_md.version); + metadata->extent_no = uint32_t_le2host(scratch_md.extent_no); + metadata->index = uint32_t_le2host(scratch_md.index); + metadata->level = uint32_t_le2host(scratch_md.level); + metadata->layout = uint32_t_le2host(scratch_md.layout); + metadata->strip_size = uint32_t_le2host(scratch_md.strip_size); + metadata->bsize = uint32_t_le2host(scratch_md.bsize); + memcpy(metadata->devname, scratch_md.devname, HR_DEVNAME_LEN); + + if (metadata->version != HR_NATIVE_METADATA_VERSION) + return EINVAL; + + return EOK; +} + +static errno_t meta_native_get_block(service_id_t dev, void **rblock) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + void *block; + + if (rblock == NULL) + return EINVAL; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize < sizeof(hr_metadata_t)) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < HR_NATIVE_META_SIZE) + return EINVAL; + + block = malloc(bsize); + if (block == NULL) + return ENOMEM; + + rc = hr_read_direct(dev, blkno - 1, HR_NATIVE_META_SIZE, block); + if (rc != EOK) { + free(block); + return rc; + } + + *rblock = block; + return EOK; +} + +static errno_t meta_native_write_block(service_id_t dev, const void *block) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno; + size_t bsize; + + rc = block_get_bsize(dev, &bsize); + if (rc != EOK) + return rc; + + if (bsize < sizeof(hr_metadata_t)) + return EINVAL; + + rc = block_get_nblocks(dev, &blkno); + if (rc != EOK) + return rc; + + if (blkno < HR_NATIVE_META_SIZE) + return EINVAL; + + rc = hr_write_direct(dev, blkno - 1, HR_NATIVE_META_SIZE, block); + + return rc; +} + +static bool 
meta_native_has_valid_magic(const void *md_v) +{ + HR_DEBUG("%s()", __func__); + + const hr_metadata_t *md = md_v; + + if (str_lcmp(md->magic, HR_NATIVE_MAGIC_STR, HR_NATIVE_MAGIC_SIZE) != 0) + return false; + + return true; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/native.h b/uspace/srv/bd/hr/metadata/native.h new file mode 100644 index 0000000000..8f7dfe076d --- /dev/null +++ b/uspace/srv/bd/hr/metadata/native.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_METADATA_NATIVE_H +#define _HR_METADATA_NATIVE_H + +#include "../var.h" + +/* + * Metadata is stored on the last block of an extent. + */ +#define HR_NATIVE_META_SIZE 1 /* in blocks */ +#define HR_NATIVE_DATA_OFF 0 + +#define HR_NATIVE_MAGIC_STR "HelenRAID" +#define HR_NATIVE_MAGIC_SIZE 16 +#define HR_NATIVE_UUID_LEN 16 + +/* + * Bump when the on-disk metadata layout changes. + */ +#define HR_NATIVE_METADATA_VERSION 1 + +struct hr_metadata { + char magic[HR_NATIVE_MAGIC_SIZE]; + + uint8_t uuid[HR_NATIVE_UUID_LEN]; + + uint32_t version; + uint32_t extent_no; + uint32_t level; + uint32_t layout; + + uint32_t index; /* index of extent in volume */ + uint32_t strip_size; + uint32_t bsize; + + uint64_t data_blkno; /* usable blocks */ + uint64_t truncated_blkno; /* size of smallest extent */ + + uint64_t counter; + uint64_t rebuild_pos; + + char devname[HR_DEVNAME_LEN]; +} __attribute__((packed)); + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/metadata/noop.c b/uspace/srv/bd/hr/metadata/noop.c new file mode 100644 index 0000000000..d0d76d83a5 --- /dev/null +++ b/uspace/srv/bd/hr/metadata/noop.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include + +#include "../superblock.h" +#include "../util.h" +#include "../var.h" + +static errno_t meta_noop_probe(service_id_t, void **); +static errno_t meta_noop_init_vol2meta(hr_volume_t *); +static errno_t meta_noop_init_meta2vol(const list_t *, hr_volume_t *); +static errno_t meta_noop_erase_block(service_id_t); +static bool meta_noop_compare_uuids(const void *, const void *); +static void meta_noop_inc_counter(hr_volume_t *); +static errno_t meta_noop_save(hr_volume_t *, bool); +static errno_t meta_noop_save_ext(hr_volume_t *, size_t, bool); +static const char *meta_noop_get_devname(const void *); +static hr_level_t meta_noop_get_level(const void *); +static uint64_t meta_noop_get_data_offset(void); +static size_t meta_noop_get_size(void); +static uint8_t meta_noop_get_flags(void); +static hr_metadata_type_t meta_noop_get_type(void); +static void meta_noop_dump(const void *); + +hr_superblock_ops_t noop_ops = { + .probe = meta_noop_probe, + .init_vol2meta = meta_noop_init_vol2meta, + .init_meta2vol = meta_noop_init_meta2vol, + .erase_block = meta_noop_erase_block, + .compare_uuids = meta_noop_compare_uuids, + .inc_counter = meta_noop_inc_counter, + .save = meta_noop_save, + .save_ext = meta_noop_save_ext, + .get_devname = meta_noop_get_devname, + .get_level = meta_noop_get_level, + .get_data_offset = meta_noop_get_data_offset, + .get_size = meta_noop_get_size, + .get_flags = meta_noop_get_flags, + .get_type = meta_noop_get_type, + .dump = meta_noop_dump +}; + +static errno_t meta_noop_probe(service_id_t svc_id, void **rmd) +{ + HR_DEBUG("%s()", __func__); + + return ENOTSUP; +} + +static errno_t meta_noop_init_vol2meta(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + return EOK; +} + +static errno_t meta_noop_init_meta2vol(const list_t *list, hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + return ENOTSUP; +} + +static errno_t meta_noop_erase_block(service_id_t dev) +{ + HR_DEBUG("%s()", __func__); + + return EOK; +} + +static bool meta_noop_compare_uuids(const void *m1p, const void *m2p) +{ + return false; +} + +static void meta_noop_inc_counter(hr_volume_t *vol) +{ + (void)vol; +} + +static errno_t meta_noop_save(hr_volume_t *vol, bool with_state_callback) +{ + HR_DEBUG("%s()", __func__); + + return EOK; +} + +static errno_t meta_noop_save_ext(hr_volume_t *vol, size_t ext_idx, + bool with_state_callback) 
+{ + HR_DEBUG("%s()", __func__); + + return EOK; +} + +static const char *meta_noop_get_devname(const void *md_v) +{ + return NULL; +} + +static hr_level_t meta_noop_get_level(const void *md_v) +{ + return HR_LVL_UNKNOWN; +} + +static uint64_t meta_noop_get_data_offset(void) +{ + return 0; +} + +static size_t meta_noop_get_size(void) +{ + return 0; +} + +static uint8_t meta_noop_get_flags(void) +{ + HR_DEBUG("%s()", __func__); + + uint8_t flags = 0; + + flags |= HR_METADATA_HOTSPARE_SUPPORT; + flags |= HR_METADATA_ALLOW_REBUILD; + + return flags; +} + +static hr_metadata_type_t meta_noop_get_type(void) +{ + HR_DEBUG("%s()", __func__); + + return HR_METADATA_NOOP; +} + +static void meta_noop_dump(const void *md_v) +{ + HR_DEBUG("%s()", __func__); + + printf("NOOP Metadata\n"); +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/parity_stripe.c b/uspace/srv/bd/hr/parity_stripe.c new file mode 100644 index 0000000000..88c3e747a7 --- /dev/null +++ b/uspace/srv/bd/hr/parity_stripe.c @@ -0,0 +1,853 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include + +#include "io.h" +#include "parity_stripe.h" +#include "util.h" +#include "var.h" + +static void hr_execute_write_stripe_degraded_mixed(hr_stripe_t *, size_t); +static void hr_execute_write_stripe_degraded(hr_stripe_t *, size_t); +static void hr_execute_write_stripe_optimal_reconstruct(hr_stripe_t *); +static void hr_execute_write_stripe_subtract(hr_stripe_t *, size_t); +static void hr_execute_write_stripe(hr_stripe_t *, size_t); +static void hr_execute_read_stripe(hr_stripe_t *, size_t); +static bool hr_stripe_range_non_extension(const range_t *, const range_t *, + range_t *); +static size_t hr_stripe_merge_extent_spans(hr_stripe_t *, size_t, range_t [2]); +static void hr_stripe_extend_range(range_t *, const range_t *); +static bool hr_ranges_overlap(const range_t *, const range_t *, range_t *); + +hr_stripe_t *hr_create_stripes(hr_volume_t *vol, uint64_t strip_size, + size_t cnt, bool write) +{ + hr_stripe_t *stripes = hr_calloc_waitok(cnt, sizeof(*stripes)); + + for (size_t i = 0; i < cnt; i++) { + fibril_mutex_initialize(&stripes[i].parity_lock); + fibril_condvar_initialize(&stripes[i].ps_added_cv); + stripes[i].vol = vol; + stripes[i].write = write; + stripes[i].parity = hr_calloc_waitok(1, strip_size); + stripes[i].parity_size = strip_size; + stripes[i].extent_span = hr_calloc_waitok(vol->extent_no, + sizeof(*stripes[i].extent_span)); + } + + return stripes; +} + +void hr_destroy_stripes(hr_stripe_t *stripes, size_t cnt) +{ + if (stripes == NULL) + return; + + for (size_t i = 0; i < cnt; i++) { + if (stripes[i].parity != NULL) + free(stripes[i].parity); + if (stripes[i].extent_span != NULL) + free(stripes[i].extent_span); + } + + free(stripes); +} + +void hr_reset_stripe(hr_stripe_t *stripe) +{ + memset(stripe->parity, 0, stripe->parity_size); + stripe->ps_added = 0; + stripe->ps_to_be_added = 0; + stripe->p_count_final = false; + + stripe->rc = EOK; + stripe->abort = false; + stripe->done = false; +} + +void hr_stripe_commit_parity(hr_stripe_t *stripe, uint64_t strip_off, + const void *data, uint64_t size) +{ + fibril_mutex_lock(&stripe->parity_lock); + hr_raid5_xor(stripe->parity + strip_off, data, size); + stripe->ps_added++; + fibril_condvar_broadcast(&stripe->ps_added_cv); + fibril_mutex_unlock(&stripe->parity_lock); +} + +void hr_stripe_wait_for_parity_commits(hr_stripe_t *stripe) +{ + fibril_mutex_lock(&stripe->parity_lock); + while ((!stripe->p_count_final || + stripe->ps_added < stripe->ps_to_be_added) && !stripe->abort) { + fibril_condvar_wait(&stripe->ps_added_cv, &stripe->parity_lock); + } + fibril_mutex_unlock(&stripe->parity_lock); +} + +void hr_stripe_parity_abort(hr_stripe_t *stripe) +{ + fibril_mutex_lock(&stripe->parity_lock); + stripe->abort = true; + fibril_condvar_broadcast(&stripe->ps_added_cv); + fibril_mutex_unlock(&stripe->parity_lock); +} + +void hr_execute_stripe(hr_stripe_t *stripe, size_t bad_extent) +{ + if (stripe->write) + hr_execute_write_stripe(stripe, bad_extent); + else + hr_execute_read_stripe(stripe, bad_extent); +} + +void hr_wait_for_stripe(hr_stripe_t *stripe) +{ + stripe->rc = hr_fgroup_wait(stripe->worker_group, NULL, NULL); + if (stripe->rc == EAGAIN) + hr_reset_stripe(stripe); + else + stripe->done = true; +} + +static void hr_execute_write_stripe_degraded_mixed(hr_stripe_t *stripe, + size_t bad_extent) +{ + hr_volume_t *vol = stripe->vol; + + stripe->range_count = hr_stripe_merge_extent_spans(stripe, + vol->extent_no, stripe->total_height); + + 
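/* + * No device write can go to the bad extent; the no-op writer below + * (hr_io_raid5_noop_writer) only folds the extent's new data into + * the parity computation. + */ + 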
size_t worker_cnt = (vol->extent_no - 2) * 3 + 3; /* upper bound */ + stripe->worker_group = hr_fgroup_create(vol->fge, worker_cnt); + + stripe->ps_to_be_added = 1; + + hr_io_raid5_t *nop_write = hr_fgroup_alloc(stripe->worker_group); + nop_write->ba = stripe->extent_span[bad_extent].range.start; + nop_write->cnt = stripe->extent_span[bad_extent].cnt; + nop_write->strip_off = + stripe->extent_span[bad_extent].strip_off * vol->bsize; + nop_write->data_write = stripe->extent_span[bad_extent].data_write; + nop_write->vol = vol; + nop_write->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, hr_io_raid5_noop_writer, + nop_write); + + for (size_t e = 0; e < vol->extent_no; e++) { + if (e == bad_extent || e == stripe->p_extent) + continue; + + range_t uncommon = { 0, 0 }; + bool has_uncommon; + has_uncommon = hr_stripe_range_non_extension( + &stripe->extent_span[bad_extent].range, + &stripe->extent_span[e].range, + &uncommon); + + if (stripe->extent_span[e].cnt == 0 || has_uncommon) { + stripe->ps_to_be_added++; + + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + if (stripe->extent_span[bad_extent].cnt == 0) { + io->ba = + stripe->extent_span[bad_extent].range.start; + io->cnt = stripe->extent_span[bad_extent].cnt; + } else { + io->ba = uncommon.start; + io->cnt = uncommon.end - uncommon.start + 1; + } + io->strip_off = + stripe->extent_span[bad_extent].strip_off * + vol->bsize; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_reconstruct_reader, io); + + if (stripe->extent_span[e].cnt == 0) + continue; + } + + range_t overlap_range; + bool overlap_up = true; + if (hr_ranges_overlap(&stripe->extent_span[e].range, + &stripe->extent_span[bad_extent].range, + &overlap_range)) { + stripe->ps_to_be_added++; + + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->ba = overlap_range.start; + io->cnt = overlap_range.end - overlap_range.start + 1; + + size_t diff = overlap_range.start - + stripe->extent_span[e].range.start; + + io->strip_off = + (stripe->extent_span[e].strip_off + diff) * + vol->bsize; + + io->data_write = stripe->extent_span[e].data_write; + io->data_write += diff * vol->bsize; + if (diff == 0) + overlap_up = false; + + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_writer, io); + } + + bool has_independent; + range_t independent = { 0, 0 }; + has_independent = hr_stripe_range_non_extension( + &stripe->extent_span[e].range, + &stripe->extent_span[bad_extent].range, + &independent); + if (has_independent) { + stripe->ps_to_be_added++; + + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->ba = independent.start; + io->cnt = independent.end - independent.start + 1; + size_t diff = 0; + if (!overlap_up) { + diff = overlap_range.end - + overlap_range.start + 1; + } + io->strip_off = + (stripe->extent_span[e].strip_off + diff) * + vol->bsize; + io->data_write = stripe->extent_span[e].data_write; + io->data_write += diff * vol->bsize; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_subtract_writer, io); + } + } + + bool has_independent = false; + range_t independent = { 0, 0 }; + for (size_t r = 0; r < stripe->range_count; r++) { + has_independent = hr_stripe_range_non_extension( + &stripe->total_height[r], + &stripe->extent_span[bad_extent].range, + &independent); + if (has_independent) { + stripe->ps_to_be_added++; + + hr_io_raid5_t *io = + 
hr_fgroup_alloc(stripe->worker_group); + io->extent = stripe->p_extent; + io->ba = independent.start; + io->cnt = independent.end - independent.start + 1; + + io->strip_off = io->ba; + hr_sub_data_offset(vol, &io->strip_off); + io->strip_off %= vol->strip_size / vol->bsize; + io->strip_off *= vol->bsize; + + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_reconstruct_reader, io); + } + + hr_io_raid5_t *pio = hr_fgroup_alloc(stripe->worker_group); + pio->extent = stripe->p_extent; + pio->ba = stripe->total_height[r].start; + pio->cnt = stripe->total_height[r].end - + stripe->total_height[r].start + 1; + pio->strip_off = pio->ba; + hr_sub_data_offset(vol, &pio->strip_off); + pio->strip_off %= vol->strip_size / vol->bsize; + pio->strip_off *= vol->bsize; + pio->vol = vol; + pio->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_parity_writer, pio); + } + + stripe->p_count_final = true; + fibril_condvar_broadcast(&stripe->ps_added_cv); +} + +static void hr_execute_write_stripe_degraded(hr_stripe_t *stripe, + size_t bad_extent) +{ + hr_volume_t *vol = stripe->vol; + + /* parity is bad, issue non-redundant writes */ + if (bad_extent == stripe->p_extent) { + stripe->worker_group = + hr_fgroup_create(vol->fge, stripe->strips_touched); + + for (size_t e = 0; e < vol->extent_no; e++) { + if (e == bad_extent) + continue; + if (stripe->extent_span[e].cnt == 0) + continue; + + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->data_write = stripe->extent_span[e].data_write; + io->ba = stripe->extent_span[e].range.start; + io->cnt = stripe->extent_span[e].cnt; + io->strip_off = + stripe->extent_span[e].strip_off * vol->bsize; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_basic_writer, io); + } + + return; + } + + if (stripe->extent_span[bad_extent].cnt > 0) + hr_execute_write_stripe_degraded_mixed(stripe, bad_extent); + else + hr_execute_write_stripe_subtract(stripe, bad_extent); +} + +static void hr_execute_write_stripe_optimal_reconstruct(hr_stripe_t *stripe) +{ + hr_volume_t *vol = stripe->vol; + + stripe->range_count = hr_stripe_merge_extent_spans(stripe, + vol->extent_no, stripe->total_height); + + bool full_stripe = false; + size_t worker_cnt; + if (stripe->strips_touched == vol->extent_no - 1 && + stripe->partial_strips_touched == 0) { + /* full-stripe */ + worker_cnt = stripe->strips_touched; /* writers */ + worker_cnt += 1; /* parity writer */ + + stripe->ps_to_be_added = stripe->strips_touched; + stripe->p_count_final = true; + + full_stripe = true; + } else { + worker_cnt = stripe->strips_touched; /* writers */ + + /* readers (upper bound) */ + worker_cnt += ((vol->extent_no - 1) - stripe->strips_touched) * + stripe->range_count; + worker_cnt += stripe->partial_strips_touched; + + worker_cnt += stripe->range_count; /* parity writer(s) */ + + stripe->ps_to_be_added = stripe->strips_touched; /* writers */ + } + + stripe->worker_group = hr_fgroup_create(vol->fge, worker_cnt); + + for (size_t e = 0; e < vol->extent_no; e++) { + if (e == stripe->p_extent) + continue; + + if (stripe->extent_span[e].cnt == 0) + continue; + + hr_io_raid5_t *io = hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->data_write = stripe->extent_span[e].data_write; + io->ba = stripe->extent_span[e].range.start; + io->cnt = stripe->extent_span[e].cnt; + io->strip_off = stripe->extent_span[e].strip_off * vol->bsize; + io->vol = vol; + io->stripe = stripe; + + 
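/* + * Each data writer also commits its strip into the running parity, + * which is why ps_to_be_added above counts the writers. + */ + 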
hr_fgroup_submit(stripe->worker_group, hr_io_raid5_writer, io); + } + + for (size_t r = 0; r < stripe->range_count; r++) { + if (full_stripe) + goto skip_readers; + for (size_t e = 0; e < vol->extent_no; e++) { + if (e == stripe->p_extent) + continue; + + range_t range_extension = { 0, 0 }; + + bool need_reader = false; + if (stripe->extent_span[e].cnt == 0) { + range_extension = stripe->total_height[r]; + need_reader = true; + } else { + need_reader = hr_stripe_range_non_extension( + &stripe->total_height[r], + &stripe->extent_span[e].range, + &range_extension); + } + + if (need_reader) { + stripe->ps_to_be_added++; + + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->ba = range_extension.start; + io->cnt = range_extension.end - + range_extension.start + 1; + io->vol = vol; + io->stripe = stripe; + + io->strip_off = io->ba; + hr_sub_data_offset(vol, &io->strip_off); + io->strip_off %= vol->strip_size / vol->bsize; + io->strip_off *= vol->bsize; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_reconstruct_reader, io); + } + } + + stripe->p_count_final = true; + fibril_condvar_broadcast(&stripe->ps_added_cv); + + skip_readers: + + /* parity writer */ + hr_io_raid5_t *io = hr_fgroup_alloc(stripe->worker_group); + io->extent = stripe->p_extent; + io->ba = stripe->total_height[r].start; + io->cnt = stripe->total_height[r].end - + stripe->total_height[r].start + 1; + io->vol = vol; + io->stripe = stripe; + + io->strip_off = io->ba; + hr_sub_data_offset(vol, &io->strip_off); + io->strip_off %= vol->strip_size / vol->bsize; + io->strip_off *= vol->bsize; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_parity_writer, io); + } +} + +static void hr_execute_write_stripe_subtract(hr_stripe_t *stripe, size_t bad) +{ + hr_volume_t *vol = stripe->vol; + + stripe->range_count = hr_stripe_merge_extent_spans(stripe, + vol->extent_no, stripe->total_height); + + size_t worker_cnt; + worker_cnt = stripe->strips_touched; /* writers */ + worker_cnt += stripe->range_count * 2; /* parity readers & writers */ + + stripe->ps_to_be_added = stripe->strips_touched; /* writers */ + stripe->ps_to_be_added += stripe->range_count; /* parity readers */ + stripe->p_count_final = true; + + stripe->worker_group = hr_fgroup_create(vol->fge, worker_cnt); + + for (size_t e = 0; e < vol->extent_no; e++) { + if (e == bad || e == stripe->p_extent) + continue; + + if (stripe->extent_span[e].cnt == 0) + continue; + + hr_io_raid5_t *io = hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->data_write = stripe->extent_span[e].data_write; + io->ba = stripe->extent_span[e].range.start; + io->cnt = stripe->extent_span[e].cnt; + io->strip_off = stripe->extent_span[e].strip_off * vol->bsize; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_subtract_writer, io); + } + + for (size_t r = 0; r < stripe->range_count; r++) { + hr_io_raid5_t *p_reader = hr_fgroup_alloc(stripe->worker_group); + p_reader->extent = stripe->p_extent; + p_reader->ba = stripe->total_height[r].start; + p_reader->cnt = stripe->total_height[r].end - + stripe->total_height[r].start + 1; + p_reader->vol = vol; + p_reader->stripe = stripe; + + p_reader->strip_off = p_reader->ba; + hr_sub_data_offset(vol, &p_reader->strip_off); + p_reader->strip_off %= vol->strip_size / vol->bsize; + p_reader->strip_off *= vol->bsize; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_reconstruct_reader, p_reader); + + hr_io_raid5_t *p_writer = 
hr_fgroup_alloc(stripe->worker_group); + p_writer->extent = stripe->p_extent; + p_writer->ba = stripe->total_height[r].start; + p_writer->cnt = stripe->total_height[r].end - + stripe->total_height[r].start + 1; + p_writer->vol = vol; + p_writer->stripe = stripe; + + p_writer->strip_off = p_writer->ba; + hr_sub_data_offset(vol, &p_writer->strip_off); + p_writer->strip_off %= vol->strip_size / vol->bsize; + p_writer->strip_off *= vol->bsize; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_parity_writer, p_writer); + } + +} + +static void hr_execute_write_stripe(hr_stripe_t *stripe, size_t bad_extent) +{ + hr_volume_t *vol = stripe->vol; + + if (bad_extent < vol->extent_no) { + hr_execute_write_stripe_degraded(stripe, bad_extent); + return; + } + + if (stripe->subtract) + hr_execute_write_stripe_subtract(stripe, vol->extent_no); + else + hr_execute_write_stripe_optimal_reconstruct(stripe); +} + +static void hr_execute_read_stripe(hr_stripe_t *stripe, size_t bad_extent) +{ + hr_volume_t *vol = stripe->vol; + + /* no parity involved */ + if (bad_extent == vol->extent_no || + bad_extent == stripe->p_extent || + stripe->extent_span[bad_extent].cnt == 0) { + stripe->worker_group = + hr_fgroup_create(vol->fge, stripe->strips_touched); + for (size_t e = 0; e < vol->extent_no; e++) { + if (e == bad_extent || e == stripe->p_extent) + continue; + if (stripe->extent_span[e].cnt == 0) + continue; + + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->data_read = stripe->extent_span[e].data_read; + io->ba = stripe->extent_span[e].range.start; + io->cnt = stripe->extent_span[e].cnt; + io->strip_off = + stripe->extent_span[e].strip_off * vol->bsize; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_basic_reader, io); + } + + return; + } + + /* parity involved */ + + size_t worker_cnt = (vol->extent_no - 2) * 2 + 1; /* upper bound */ + stripe->worker_group = hr_fgroup_create(vol->fge, worker_cnt); + + stripe->ps_to_be_added = 0; + + for (size_t e = 0; e < vol->extent_no; e++) { + if (e == bad_extent || e == stripe->p_extent) + continue; + + range_t uncommon = { 0, 0 }; + bool has_uncommon; + has_uncommon = hr_stripe_range_non_extension( + &stripe->extent_span[bad_extent].range, + &stripe->extent_span[e].range, + &uncommon); + + if (stripe->extent_span[e].cnt == 0 || has_uncommon) { + + stripe->ps_to_be_added++; + + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + if (stripe->extent_span[bad_extent].cnt == 0) { + io->ba = + stripe->extent_span[bad_extent].range.start; + io->cnt = stripe->extent_span[bad_extent].cnt; + } else { + io->ba = uncommon.start; + io->cnt = uncommon.end - uncommon.start + 1; + } + io->strip_off = + stripe->extent_span[bad_extent].strip_off * + vol->bsize; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_reconstruct_reader, io); + + if (stripe->extent_span[e].cnt == 0) + continue; + } + + range_t overlap_range; + bool overlap_up = true; + if (hr_ranges_overlap(&stripe->extent_span[e].range, + &stripe->extent_span[bad_extent].range, + &overlap_range)) { + + stripe->ps_to_be_added++; + + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->ba = overlap_range.start; + io->cnt = overlap_range.end - overlap_range.start + 1; + + size_t diff = overlap_range.start - + stripe->extent_span[e].range.start; + io->strip_off = + (stripe->extent_span[e].strip_off + diff) * + vol->bsize; + + 
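/* + * Read straight into the caller's buffer, offset to where the + * overlapping blocks belong within this request. + */ + 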
io->data_read = stripe->extent_span[e].data_read; + io->data_read += diff * vol->bsize; + if (diff == 0) + overlap_up = false; + + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_reader, io); + } + + bool has_independent; + range_t independent = { 0, 0 }; + has_independent = hr_stripe_range_non_extension( + &stripe->extent_span[e].range, + &uncommon, + &independent); + if (has_independent) { + hr_io_raid5_t *io = + hr_fgroup_alloc(stripe->worker_group); + io->extent = e; + io->ba = independent.start; + io->cnt = independent.end - independent.start + 1; + size_t diff = 0; + if (!overlap_up) { + diff = + overlap_range.end - overlap_range.start + 1; + } + io->strip_off = + (stripe->extent_span[e].strip_off + diff) * + vol->bsize; + io->data_read = stripe->extent_span[e].data_read; + io->data_read += diff * vol->bsize; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, + hr_io_raid5_basic_reader, io); + } + } + + stripe->ps_to_be_added++; + + hr_io_raid5_t *io = hr_fgroup_alloc(stripe->worker_group); + io->extent = stripe->p_extent; + io->ba = stripe->extent_span[bad_extent].range.start; + io->cnt = stripe->extent_span[bad_extent].cnt; + io->strip_off = stripe->extent_span[bad_extent].strip_off * vol->bsize; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, hr_io_raid5_reconstruct_reader, + io); + + stripe->p_count_final = true; + fibril_condvar_broadcast(&stripe->ps_added_cv); + + hr_io_raid5_t *pcopier_io = hr_fgroup_alloc(stripe->worker_group); + pcopier_io->cnt = stripe->extent_span[bad_extent].cnt; + pcopier_io->strip_off = + stripe->extent_span[bad_extent].strip_off * vol->bsize; + pcopier_io->data_read = stripe->extent_span[bad_extent].data_read; + pcopier_io->vol = vol; + pcopier_io->stripe = stripe; + + hr_fgroup_submit(stripe->worker_group, hr_io_raid5_parity_getter, + pcopier_io); +} + +/** Get non-overlapping part of 2 ranges. + * + * Return part of @param r1 not in @param r2. If @a r1 sticks out of + * @a r2 on both sides, only the lower part is returned. + * + * @param r1 Main range. + * @param r2 Queried range. + * @param out Place to store resulting range. + * + * @return true if output range is non-empty, else false. + */ +static bool hr_stripe_range_non_extension(const range_t *r1, const range_t *r2, + range_t *out) +{ + if (r1->end < r2->start) { + *out = *r1; + return true; + } + + if (r1->start > r2->end) { + *out = *r1; + return true; + } + + if (r1->start < r2->start && r1->end >= r2->start) { + out->start = r1->start; + out->end = r2->start - 1; + return out->start <= out->end; + } + + if (r1->start <= r2->end && r1->end > r2->end) { + out->start = r2->end + 1; + out->end = r1->end; + return out->start <= out->end; + } + + return false; +} + +/** Merge overlapping extent spans. + * + * @param s Stripe. + * @param extent_no Number of extents. + * @param out Place to store resulting ranges. + * + * @return Number of resulting ranges. 
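+ * + * Example: spans {0..3} and {2..7} overlap and are merged into one + * range {0..7}, while disjoint spans such as {0..1} and {6..7} are + * returned as two separate ranges.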
+ */ +static size_t hr_stripe_merge_extent_spans(hr_stripe_t *s, size_t extent_no, + range_t out[2]) +{ + size_t out_count = 0; + + for (size_t i = 0; i < extent_no; i++) { + if (s->extent_span[i].cnt == 0) + continue; + const range_t *r = &s->extent_span[i].range; + bool merged = false; + + for (size_t j = 0; j < out_count; j++) { + if (hr_ranges_overlap(&out[j], r, NULL)) { + hr_stripe_extend_range(&out[j], r); + merged = true; + + if (out_count == 2 && + hr_ranges_overlap(&out[0], &out[1], NULL)) { + hr_stripe_extend_range(&out[0], &out[1]); + out_count = 1; + } + + break; + } + } + + if (!merged) { + assert(out_count < 2); + out[out_count++] = *r; + } + } + + return out_count; +} + +/** Extend a range. + * + * @param r1 Output range. + * @param r2 Range to extend the output one with. + * + */ +static void hr_stripe_extend_range(range_t *r1, const range_t *r2) +{ + if (r2->start < r1->start) + r1->start = r2->start; + if (r2->end > r1->end) + r1->end = r2->end; +} + +static bool hr_ranges_overlap(const range_t *a, const range_t *b, range_t *out) +{ + uint64_t start = a->start > b->start ? a->start : b->start; + uint64_t end = a->end < b->end ? a->end : b->end; + + if (start <= end) { + if (out != NULL) { + out->start = start; + out->end = end; + } + + return true; + } + + return false; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/parity_stripe.h b/uspace/srv/bd/hr/parity_stripe.h new file mode 100644 index 0000000000..0f907b0a19 --- /dev/null +++ b/uspace/srv/bd/hr/parity_stripe.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_STRIPE_H +#define _HR_STRIPE_H + +#include +#include +#include +#include + +#include "io.h" +#include "var.h" + +typedef struct { + uint64_t start; + uint64_t end; +} range_t; + +typedef struct hr_stripe { + hr_volume_t *vol; + bool write; + bool subtract; + size_t strips_touched; + size_t partial_strips_touched; + struct { + range_t range; + uint64_t cnt; + uint64_t strip_off; + const void *data_write; + void *data_read; + } *extent_span; + uint64_t p_extent; /* parity extent index for this stripe */ + + hr_fgroup_t *worker_group; + + errno_t rc; + bool abort; + bool done; + + fibril_mutex_t parity_lock; + uint8_t *parity; /* the actual parity strip */ + uint64_t parity_size; + + /* parity writers wait until this many parities are committed */ + size_t ps_to_be_added; + size_t ps_added; /* number of parities committed to stripe */ + fibril_condvar_t ps_added_cv; + bool p_count_final; + + /* + * Up to 2 ranges may be needed, because a single IO that partially + * spans 2 strips and overflows into the second one without creating + * an adjacent range results in the parity not being contiguous. + * + * Example: 2+1 extents, 4 block strip, last extent parity + * + * E0 E1 P + * +----+ +----+ +-----+ + * | | | IO | | IOP | + * |----| |----| |-----| + * | | | | | | + * |----| |----| |-----| + * | | | | | | + * |----| |----| |-----| + * | IO | | | | IOP | + * +----+ +----+ +-----+ + * + * - need 2 parity writers + */ + range_t total_height[2]; /* parity write range(s) */ + size_t range_count; +} hr_stripe_t; + +extern hr_stripe_t *hr_create_stripes(hr_volume_t *, uint64_t, size_t, bool); +extern void hr_destroy_stripes(hr_stripe_t *, size_t); +extern void hr_reset_stripe(hr_stripe_t *); +extern void hr_stripe_commit_parity(hr_stripe_t *, uint64_t, const void *, + uint64_t); +extern void hr_stripe_wait_for_parity_commits(hr_stripe_t *); +extern void hr_stripe_parity_abort(hr_stripe_t *); +extern void hr_execute_stripe(hr_stripe_t *, size_t); +extern void hr_wait_for_stripe(hr_stripe_t *); + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/raid0.c b/uspace/srv/bd/hr/raid0.c new file mode 100644 index 0000000000..225edc3ed4 --- /dev/null +++ b/uspace/srv/bd/hr/raid0.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io.h" +#include "superblock.h" +#include "util.h" +#include "var.h" + +static errno_t hr_raid0_bd_op(hr_bd_op_type_t, bd_srv_t *, aoff64_t, size_t, + void *, const void *, size_t); + +/* bdops */ +static errno_t hr_raid0_bd_open(bd_srvs_t *, bd_srv_t *); +static errno_t hr_raid0_bd_close(bd_srv_t *); +static errno_t hr_raid0_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *, + size_t); +static errno_t hr_raid0_bd_sync_cache(bd_srv_t *, aoff64_t, size_t); +static errno_t hr_raid0_bd_write_blocks(bd_srv_t *, aoff64_t, size_t, + const void *, size_t); +static errno_t hr_raid0_bd_get_block_size(bd_srv_t *, size_t *); +static errno_t hr_raid0_bd_get_num_blocks(bd_srv_t *, aoff64_t *); + +static bd_ops_t hr_raid0_bd_ops = { + .open = hr_raid0_bd_open, + .close = hr_raid0_bd_close, + .sync_cache = hr_raid0_bd_sync_cache, + .read_blocks = hr_raid0_bd_read_blocks, + .write_blocks = hr_raid0_bd_write_blocks, + .get_block_size = hr_raid0_bd_get_block_size, + .get_num_blocks = hr_raid0_bd_get_num_blocks +}; + +extern loc_srv_t *hr_srv; + +errno_t hr_raid0_create(hr_volume_t *new_volume) +{ + HR_DEBUG("%s()", __func__); + + if (new_volume->level != HR_LVL_0) + return EINVAL; + + if (new_volume->extent_no < 2) { + HR_ERROR("RAID 0 volume needs at least 2 devices\n"); + return EINVAL; + } + + hr_raid0_vol_state_eval(new_volume); + if (new_volume->state != HR_VOL_OPTIMAL) { + HR_NOTE("\"%s\": unusable state, not creating\n", + new_volume->devname); + return EINVAL; + } + + bd_srvs_init(&new_volume->hr_bds); + new_volume->hr_bds.ops = &hr_raid0_bd_ops; + new_volume->hr_bds.sarg = new_volume; + + return EOK; +} + +/* + * Called only once in volume's lifetime. 
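+ * Computes the RAID 0 geometry: data offset, usable data block count and strip size.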
+ */ +errno_t hr_raid0_init(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + if (vol->level != HR_LVL_0) + return EINVAL; + + uint64_t total_blkno = vol->truncated_blkno * vol->extent_no; + + vol->data_offset = vol->meta_ops->get_data_offset(); + + vol->data_blkno = total_blkno; + /* count md blocks */ + vol->data_blkno -= vol->meta_ops->get_size() * vol->extent_no; + + vol->strip_size = hr_closest_pow2(HR_STRIP_SIZE / vol->extent_no); + + return EOK; +} + +void hr_raid0_vol_state_eval(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + fibril_mutex_lock(&vol->md_lock); + + fibril_mutex_unlock(&vol->md_lock); + + fibril_rwlock_read_lock(&vol->states_lock); + + hr_vol_state_t old_state = vol->state; + + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state != HR_EXT_ONLINE) { + fibril_rwlock_read_unlock(&vol->states_lock); + + if (old_state != HR_VOL_FAULTY) { + fibril_rwlock_write_lock(&vol->states_lock); + hr_update_vol_state(vol, HR_VOL_FAULTY); + fibril_rwlock_write_unlock(&vol->states_lock); + } + return; + } + } + + fibril_rwlock_read_unlock(&vol->states_lock); + + if (old_state != HR_VOL_OPTIMAL) { + fibril_rwlock_write_lock(&vol->states_lock); + hr_update_vol_state(vol, HR_VOL_OPTIMAL); + fibril_rwlock_write_unlock(&vol->states_lock); + } +} + +void hr_raid0_ext_state_cb(hr_volume_t *vol, size_t extent, errno_t rc) +{ + HR_DEBUG("%s()", __func__); + + if (rc == EOK) + return; + + fibril_rwlock_write_lock(&vol->states_lock); + + switch (rc) { + case ENOENT: + hr_update_ext_state(vol, extent, HR_EXT_MISSING); + break; + default: + hr_update_ext_state(vol, extent, HR_EXT_FAILED); + } + + hr_update_vol_state(vol, HR_VOL_FAULTY); + + fibril_rwlock_write_unlock(&vol->states_lock); +} + +static errno_t hr_raid0_bd_open(bd_srvs_t *bds, bd_srv_t *bd) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol = bd->srvs->sarg; + + atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed); + + return EOK; +} + +static errno_t hr_raid0_bd_close(bd_srv_t *bd) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol = bd->srvs->sarg; + + atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed); + + return EOK; +} + +static errno_t hr_raid0_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt) +{ + hr_volume_t *vol = bd->srvs->sarg; + + return hr_sync_extents(vol); +} + +static errno_t hr_raid0_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt, + void *buf, size_t size) +{ + return hr_raid0_bd_op(HR_BD_READ, bd, ba, cnt, buf, NULL, size); +} + +static errno_t hr_raid0_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt, + const void *data, size_t size) +{ + return hr_raid0_bd_op(HR_BD_WRITE, bd, ba, cnt, NULL, data, size); +} + +static errno_t hr_raid0_bd_get_block_size(bd_srv_t *bd, size_t *rsize) +{ + hr_volume_t *vol = bd->srvs->sarg; + + *rsize = vol->bsize; + return EOK; +} + +static errno_t hr_raid0_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb) +{ + hr_volume_t *vol = bd->srvs->sarg; + + *rnb = vol->data_blkno; + return EOK; +} + +static errno_t hr_raid0_bd_op(hr_bd_op_type_t type, bd_srv_t *bd, aoff64_t ba, + size_t cnt, void *dst, const void *src, size_t size) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol = bd->srvs->sarg; + errno_t rc; + uint64_t phys_block, len; + size_t left; + const uint8_t *data_write = src; + uint8_t *data_read = dst; + + if (size < cnt * vol->bsize) + return EINVAL; + + if (vol->vflags & HR_VOL_FLAG_READ_ONLY && type == HR_BD_WRITE) + return ENOTSUP; + + fibril_rwlock_read_lock(&vol->states_lock); + if (vol->state 
!= HR_VOL_OPTIMAL) { + fibril_rwlock_read_unlock(&vol->states_lock); + return EIO; + } + fibril_rwlock_read_unlock(&vol->states_lock); + + rc = hr_check_ba_range(vol, cnt, ba); + if (rc != EOK) + return rc; + + uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */ + uint64_t strip_no = ba / strip_size; + uint64_t extent = strip_no % vol->extent_no; + uint64_t stripe = strip_no / vol->extent_no; + uint64_t strip_off = ba % strip_size; + + left = cnt; + + /* calculate how many strips the IO spans */ + size_t end_strip_no = (ba + cnt - 1) / strip_size; + size_t span = end_strip_no - strip_no + 1; + + hr_fgroup_t *group = hr_fgroup_create(vol->fge, span); + + while (left != 0) { + phys_block = stripe * strip_size + strip_off; + cnt = min(left, strip_size - strip_off); + len = vol->bsize * cnt; + hr_add_data_offset(vol, &phys_block); + + hr_io_t *io = hr_fgroup_alloc(group); + io->extent = extent; + io->data_write = data_write; + io->data_read = data_read; + io->ba = phys_block; + io->cnt = cnt; + io->type = type; + io->vol = vol; + + hr_fgroup_submit(group, hr_io_worker, io); + + left -= cnt; + if (left == 0) + break; + + data_read += len; + data_write += len; + + strip_off = 0; + extent++; + if (extent >= vol->extent_no) { + stripe++; + extent = 0; + } + } + + size_t bad; + (void)hr_fgroup_wait(group, NULL, &bad); + + if (bad > 0) + return EIO; + + return EOK; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/raid1.c b/uspace/srv/bd/hr/raid1.c new file mode 100644 index 0000000000..043813478e --- /dev/null +++ b/uspace/srv/bd/hr/raid1.c @@ -0,0 +1,746 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fge.h" +#include "io.h" +#include "superblock.h" +#include "util.h" +#include "var.h" + +static void hr_raid1_vol_state_eval_forced(hr_volume_t *); +static size_t hr_raid1_count_good_w_extents(hr_volume_t *, uint64_t, size_t, + uint64_t); +static errno_t hr_raid1_bd_op(hr_bd_op_type_t, hr_volume_t *, aoff64_t, size_t, + void *, const void *, size_t); +static errno_t hr_raid1_rebuild(void *); + +/* bdops */ +static errno_t hr_raid1_bd_open(bd_srvs_t *, bd_srv_t *); +static errno_t hr_raid1_bd_close(bd_srv_t *); +static errno_t hr_raid1_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *, + size_t); +static errno_t hr_raid1_bd_sync_cache(bd_srv_t *, aoff64_t, size_t); +static errno_t hr_raid1_bd_write_blocks(bd_srv_t *, aoff64_t, size_t, + const void *, size_t); +static errno_t hr_raid1_bd_get_block_size(bd_srv_t *, size_t *); +static errno_t hr_raid1_bd_get_num_blocks(bd_srv_t *, aoff64_t *); + +static bd_ops_t hr_raid1_bd_ops = { + .open = hr_raid1_bd_open, + .close = hr_raid1_bd_close, + .sync_cache = hr_raid1_bd_sync_cache, + .read_blocks = hr_raid1_bd_read_blocks, + .write_blocks = hr_raid1_bd_write_blocks, + .get_block_size = hr_raid1_bd_get_block_size, + .get_num_blocks = hr_raid1_bd_get_num_blocks +}; + +extern loc_srv_t *hr_srv; + +errno_t hr_raid1_create(hr_volume_t *new_volume) +{ + HR_DEBUG("%s()", __func__); + + if (new_volume->level != HR_LVL_1) + return EINVAL; + + if (new_volume->extent_no < 2) { + HR_ERROR("RAID 1 volume needs at least 2 devices\n"); + return EINVAL; + } + + bd_srvs_init(&new_volume->hr_bds); + new_volume->hr_bds.ops = &hr_raid1_bd_ops; + new_volume->hr_bds.sarg = new_volume; + + hr_raid1_vol_state_eval_forced(new_volume); + + fibril_rwlock_read_lock(&new_volume->states_lock); + hr_vol_state_t state = new_volume->state; + fibril_rwlock_read_unlock(&new_volume->states_lock); + if (state == HR_VOL_FAULTY || state == HR_VOL_NONE) { + HR_NOTE("\"%s\": unusable state, not creating\n", + new_volume->devname); + return EINVAL; + } + + return EOK; +} + +/* + * Called only once in volume's lifetime. 
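+ * Computes the mirror geometry: data offset and usable data block count (no striping).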
+ */ +errno_t hr_raid1_init(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + if (vol->level != HR_LVL_1) + return EINVAL; + + vol->data_offset = vol->meta_ops->get_data_offset(); + vol->data_blkno = vol->truncated_blkno - vol->meta_ops->get_size(); + vol->strip_size = 0; + + return EOK; +} + +void hr_raid1_vol_state_eval(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + bool exp = true; + if (!atomic_compare_exchange_strong(&vol->state_dirty, &exp, false)) + return; + + vol->meta_ops->inc_counter(vol); + vol->meta_ops->save(vol, WITH_STATE_CALLBACK); + + hr_raid1_vol_state_eval_forced(vol); +} + +void hr_raid1_ext_state_cb(hr_volume_t *vol, size_t extent, errno_t rc) +{ + HR_DEBUG("%s()", __func__); + + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + + if (rc == EOK) + return; + + fibril_rwlock_write_lock(&vol->states_lock); + + switch (rc) { + case ENOENT: + hr_update_ext_state(vol, extent, HR_EXT_MISSING); + break; + default: + hr_update_ext_state(vol, extent, HR_EXT_FAILED); + } + + hr_mark_vol_state_dirty(vol); + + fibril_rwlock_write_unlock(&vol->states_lock); +} + +static void hr_raid1_vol_state_eval_forced(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + fibril_rwlock_read_lock(&vol->extents_lock); + fibril_rwlock_read_lock(&vol->states_lock); + + hr_vol_state_t old_state = vol->state; + size_t healthy = hr_count_extents(vol, HR_EXT_ONLINE); + + size_t invalid_no = hr_count_extents(vol, HR_EXT_INVALID); + + size_t rebuild_no = hr_count_extents(vol, HR_EXT_REBUILD); + + fibril_rwlock_read_unlock(&vol->states_lock); + fibril_rwlock_read_unlock(&vol->extents_lock); + + fibril_mutex_lock(&vol->hotspare_lock); + size_t hs_no = vol->hotspare_no; + fibril_mutex_unlock(&vol->hotspare_lock); + + if (healthy == 0) { + if (old_state != HR_VOL_FAULTY) { + fibril_rwlock_write_lock(&vol->states_lock); + hr_update_vol_state(vol, HR_VOL_FAULTY); + fibril_rwlock_write_unlock(&vol->states_lock); + } + } else if (healthy < vol->extent_no) { + if (old_state != HR_VOL_REBUILD && + old_state != HR_VOL_DEGRADED) { + fibril_rwlock_write_lock(&vol->states_lock); + hr_update_vol_state(vol, HR_VOL_DEGRADED); + fibril_rwlock_write_unlock(&vol->states_lock); + } + + if (hs_no > 0 || invalid_no > 0 || rebuild_no > 0) { + fid_t fib = fibril_create(hr_raid1_rebuild, vol); + if (fib == 0) + return; + fibril_start(fib); + fibril_detach(fib); + } + } else { + if (old_state != HR_VOL_OPTIMAL) { + fibril_rwlock_write_lock(&vol->states_lock); + hr_update_vol_state(vol, HR_VOL_OPTIMAL); + fibril_rwlock_write_unlock(&vol->states_lock); + } + } +} + +static errno_t hr_raid1_bd_open(bd_srvs_t *bds, bd_srv_t *bd) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol = bd->srvs->sarg; + + atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed); + + return EOK; +} + +static errno_t hr_raid1_bd_close(bd_srv_t *bd) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol = bd->srvs->sarg; + + atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed); + + return EOK; +} + +static errno_t hr_raid1_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt) +{ + hr_volume_t *vol = bd->srvs->sarg; + + return hr_sync_extents(vol); +} + +static errno_t hr_raid1_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt, + void *buf, size_t size) +{ + hr_volume_t *vol = bd->srvs->sarg; + + return hr_raid1_bd_op(HR_BD_READ, vol, ba, cnt, buf, NULL, size); +} + +static errno_t hr_raid1_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt, + const void *data, size_t size) +{ + hr_volume_t *vol = 
bd->srvs->sarg; + + if (vol->vflags & HR_VOL_FLAG_READ_ONLY) + return ENOTSUP; + + return hr_raid1_bd_op(HR_BD_WRITE, vol, ba, cnt, NULL, data, size); +} + +static errno_t hr_raid1_bd_get_block_size(bd_srv_t *bd, size_t *rsize) +{ + hr_volume_t *vol = bd->srvs->sarg; + + *rsize = vol->bsize; + return EOK; +} + +static errno_t hr_raid1_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb) +{ + hr_volume_t *vol = bd->srvs->sarg; + + *rnb = vol->data_blkno; + return EOK; +} + +static size_t hr_raid1_count_good_w_extents(hr_volume_t *vol, uint64_t ba, + size_t cnt, uint64_t rebuild_blk) +{ + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + assert(fibril_rwlock_is_locked(&vol->states_lock)); + + size_t count = 0; + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_ONLINE || + (vol->extents[i].state == HR_EXT_REBUILD && + rebuild_blk >= ba)) { + count++; + } + } + + return count; +} + +#ifdef HR_RAID1_READ_STRATEGY_SPLIT + +static size_t hr_raid1_count_good_r_extents(hr_volume_t *vol, uint64_t ba, + size_t cnt, uint64_t rebuild_blk) +{ + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + assert(fibril_rwlock_is_locked(&vol->states_lock)); + + size_t count = 0; + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_ONLINE || + (vol->extents[i].state == HR_EXT_REBUILD && + rebuild_blk > ba + cnt - 1)) { + count++; + } + } + + return count; +} + +#endif /* HR_RAID1_READ_STRATEGY_SPLIT */ + +#ifdef HR_RAID1_READ_STRATEGY_CLOSEST + +static size_t get_ext(hr_volume_t *vol, uint64_t ba, size_t cnt, + uint64_t rebuild_blk) +{ + uint64_t closest_e; + uint64_t pos; + uint64_t mdiff = UINT64_MAX; + hr_ext_state_t state = vol->extents[0].state; + if (state != HR_EXT_ONLINE && + (state != HR_EXT_REBUILD || ba + cnt - 1 >= rebuild_blk)) { + closest_e = 1; + } else { + closest_e = 0; + pos = atomic_load_explicit(&vol->last_ext_pos_arr[0], + memory_order_relaxed); + mdiff = (pos > ba) ? pos - ba : ba - pos; + } + + for (size_t e = 1; e < vol->extent_no; e++) { + state = vol->extents[e].state; + if (state != HR_EXT_ONLINE && + (state != HR_EXT_REBUILD || ba + cnt - 1 >= rebuild_blk)) { + continue; + } + + pos = atomic_load_explicit(&vol->last_ext_pos_arr[e], + memory_order_relaxed); + uint64_t diff = (pos > ba) ? 
pos - ba : ba - pos; + if (diff < mdiff) { + mdiff = diff; + closest_e = e; + } + } + + return closest_e; +} + +#elif defined(HR_RAID1_READ_STRATEGY_ROUND_ROBIN) + +static size_t get_ext(hr_volume_t *vol, uint64_t ba, size_t cnt, + uint64_t rebuild_blk) +{ + size_t last_e; + size_t fail = 0; + + while (true) { + last_e = atomic_fetch_add_explicit(&vol->last_ext_used, 1, + memory_order_relaxed); + last_e %= vol->extent_no; + + hr_ext_state_t state = vol->extents[last_e].state; + if (state != HR_EXT_ONLINE && + (state != HR_EXT_REBUILD || ba + cnt - 1 >= rebuild_blk)) { + if (++fail >= vol->extent_no) + return vol->extent_no; + continue; + } + + break; + } + + return last_e; +} + +#elif defined(HR_RAID1_READ_STRATEGY_FIRST) + +static size_t get_ext(hr_volume_t *vol, uint64_t ba, size_t cnt, + uint64_t rebuild_blk) +{ + for (size_t e = 0; e < vol->extent_no; e++) { + hr_ext_state_t state = vol->extents[e].state; + if (state != HR_EXT_ONLINE && + (state != HR_EXT_REBUILD || ba + cnt - 1 >= rebuild_blk)) { + continue; + } + + return e; + } + return vol->extent_no; +} + +#else + +#if !defined(HR_RAID1_READ_STRATEGY_SPLIT) || \ + !defined(HR_RAID1_READ_STRATEGY_SPLIT_THRESHOLD) +#error "Some RAID 1 read strategy must be used" +#endif + +#endif + +static size_t hr_raid1_read(hr_volume_t *vol, uint64_t ba, size_t cnt, + void *data_read) +{ + uint64_t rebuild_blk; + size_t successful = 0; + +#if !defined(HR_RAID1_READ_STRATEGY_SPLIT) + errno_t rc; + + rebuild_blk = atomic_load_explicit(&vol->rebuild_blk, + memory_order_relaxed); + size_t fail = 0; + while (fail < vol->extent_no) { + fibril_rwlock_read_lock(&vol->states_lock); + size_t best_e = get_ext(vol, ba, cnt, + rebuild_blk); + fibril_rwlock_read_unlock(&vol->states_lock); + if (best_e >= vol->extent_no) + break; + rc = hr_read_direct(vol->extents[best_e].svc_id, ba, + cnt, data_read); + if (rc != EOK) { + hr_raid1_ext_state_cb(vol, best_e, rc); + fail++; + } else { + successful++; + break; + } + } + +#else + +retry_split: + size_t good; + rebuild_blk = atomic_load_explicit(&vol->rebuild_blk, + memory_order_relaxed); + + fibril_rwlock_read_lock(&vol->states_lock); + good = hr_raid1_count_good_r_extents(vol, ba, cnt, + rebuild_blk); + fibril_rwlock_read_unlock(&vol->states_lock); + + if (good == 0) + return 0; + + size_t cnt_per_ext = (cnt + good - 1) / good; + if (cnt_per_ext * vol->bsize < HR_RAID1_READ_STRATEGY_SPLIT_THRESHOLD) + cnt_per_ext = cnt; + + hr_fgroup_t *group = hr_fgroup_create(vol->fge, good); + + fibril_rwlock_read_lock(&vol->states_lock); + size_t left = cnt; + size_t e = 0; + uint8_t *data = data_read; + size_t submitted = 0; + while (left > 0) { + if (e >= vol->extent_no) { + fibril_rwlock_read_unlock(&vol->states_lock); + if (submitted) + (void)hr_fgroup_wait(group, NULL, NULL); + goto retry_split; + } + + hr_ext_state_t state = vol->extents[e].state; + if (state != HR_EXT_ONLINE && + (state != HR_EXT_REBUILD || + ba + cnt - 1 >= rebuild_blk)) { + e++; + continue; + } + + hr_io_t *io = hr_fgroup_alloc(group); + io->extent = e; + io->data_read = data; + io->ba = ba + (cnt - left); + size_t cnt_to_dispatch = min(left, cnt_per_ext); + io->cnt = cnt_to_dispatch; + io->type = HR_BD_READ; + io->vol = vol; + + hr_fgroup_submit(group, hr_io_worker, io); + submitted++; + + data += cnt_to_dispatch * vol->bsize; + left -= cnt_to_dispatch; + e++; + } + + fibril_rwlock_read_unlock(&vol->states_lock); + + (void)hr_fgroup_wait(group, &successful, NULL); + if (successful < submitted) + goto retry_split; + +#endif + + return successful; +} + 
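+/* + * Common backend for RAID 1 reads and writes. + * + * Reads are dispatched according to the compile-time-selected read strategy. + * Writes are replicated to every extent that is ONLINE, or in REBUILD with + * the target address already rebuilt. + */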
+static errno_t hr_raid1_bd_op(hr_bd_op_type_t type, hr_volume_t *vol, + aoff64_t ba, size_t cnt, void *data_read, const void *data_write, + size_t size) +{ + HR_DEBUG("%s()", __func__); + + hr_range_lock_t *rl = NULL; + errno_t rc; + uint64_t rebuild_blk; + + if (size < cnt * vol->bsize) + return EINVAL; + + fibril_rwlock_read_lock(&vol->states_lock); + hr_vol_state_t vol_state = vol->state; + fibril_rwlock_read_unlock(&vol->states_lock); + + if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE) + return EIO; + + /* increment metadata counter only on first write */ + bool exp = false; + if (type == HR_BD_WRITE && + atomic_compare_exchange_strong(&vol->first_write, &exp, true)) { + vol->meta_ops->inc_counter(vol); + vol->meta_ops->save(vol, WITH_STATE_CALLBACK); + } + + rc = hr_check_ba_range(vol, cnt, ba); + if (rc != EOK) + return rc; + + hr_add_data_offset(vol, &ba); + + /* + * extent order has to be locked for the whole IO duration, + * so that workers have consistent targets + */ + fibril_rwlock_read_lock(&vol->extents_lock); + + size_t good; + size_t successful; + switch (type) { + case HR_BD_READ: + successful = hr_raid1_read(vol, ba, cnt, data_read); + break; + case HR_BD_WRITE: + rl = hr_range_lock_acquire(vol, ba, cnt); + + fibril_rwlock_read_lock(&vol->states_lock); + + rebuild_blk = atomic_load_explicit(&vol->rebuild_blk, + memory_order_relaxed); + + good = hr_raid1_count_good_w_extents(vol, ba, cnt, + rebuild_blk); + + hr_fgroup_t *group = hr_fgroup_create(vol->fge, good); + + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state != HR_EXT_ONLINE && + (vol->extents[i].state != HR_EXT_REBUILD || + ba > rebuild_blk)) { + /* + * When the extent is being rebuilt, + * we only write to the part that is already + * rebuilt. If IO starts after vol->rebuild_blk + * we do not proceed, the write is going to + * be replicated later in the rebuild. + */ + continue; + } + + hr_io_t *io = hr_fgroup_alloc(group); + io->extent = i; + io->data_write = data_write; + io->ba = ba; + io->cnt = cnt; + io->type = type; + io->vol = vol; + + hr_fgroup_submit(group, hr_io_worker, io); + } + + fibril_rwlock_read_unlock(&vol->states_lock); + + (void)hr_fgroup_wait(group, &successful, NULL); + + hr_range_lock_release(rl); + + break; + default: + assert(0); + } + + if (successful > 0) + rc = EOK; + else + rc = EIO; + + fibril_rwlock_read_unlock(&vol->extents_lock); + + hr_raid1_vol_state_eval(vol); + + return rc; +} + +static errno_t hr_raid1_rebuild(void *arg) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol = arg; + void *buf = NULL; + size_t rebuild_idx; + hr_extent_t *rebuild_ext = NULL; + errno_t rc; + + if (vol->vflags & HR_VOL_FLAG_READ_ONLY) + return ENOTSUP; + if (!(vol->meta_ops->get_flags() & HR_METADATA_ALLOW_REBUILD)) + return ENOTSUP; + + rc = hr_init_rebuild(vol, &rebuild_idx); + if (rc != EOK) + return rc; + + rebuild_ext = &vol->extents[rebuild_idx]; + + size_t left = vol->data_blkno - vol->rebuild_blk; + size_t max_blks = DATA_XFER_LIMIT / vol->bsize; + buf = hr_malloc_waitok(max_blks * vol->bsize); + + size_t cnt; + uint64_t ba = vol->rebuild_blk; + hr_add_data_offset(vol, &ba); + + /* + * this is not necessary because a rebuild is + * protected by itself, i.e. 
there can be only + * one REBUILD at a time + */ + fibril_rwlock_read_lock(&vol->extents_lock); + + /* increment metadata counter only on first write */ + bool exp = false; + if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) { + vol->meta_ops->inc_counter(vol); + vol->meta_ops->save(vol, WITH_STATE_CALLBACK); + } + + hr_range_lock_t *rl = NULL; + + HR_NOTE("\"%s\": REBUILD started on extent no. %zu at " + "block %" PRIu64 ".\n", vol->devname, rebuild_idx, ba); + + uint64_t written = 0; + unsigned int percent, old_percent = 100; + while (left != 0) { + cnt = min(max_blks, left); + + rl = hr_range_lock_acquire(vol, ba, cnt); + + atomic_store_explicit(&vol->rebuild_blk, ba, + memory_order_relaxed); + + rc = hr_raid1_bd_op(HR_BD_READ, vol, ba, cnt, buf, NULL, + cnt * vol->bsize); + if (rc != EOK) { + hr_range_lock_release(rl); + goto end; + } + + rc = hr_write_direct(rebuild_ext->svc_id, ba, cnt, buf); + if (rc != EOK) { + hr_raid1_ext_state_cb(vol, rebuild_idx, rc); + hr_range_lock_release(rl); + goto end; + } + atomic_store_explicit(&vol->last_ext_pos_arr[rebuild_idx], + ba + cnt - 1, memory_order_relaxed); + + percent = ((ba + cnt) * 100) / vol->data_blkno; + if (percent != old_percent) { + if (percent % 5 == 0) + HR_DEBUG("\"%s\" REBUILD progress: %u%%\n", + vol->devname, percent); + } + + if (written * vol->bsize > HR_REBUILD_SAVE_BYTES) { + vol->meta_ops->save_ext(vol, rebuild_idx, + WITH_STATE_CALLBACK); + written = 0; + } + + hr_range_lock_release(rl); + + written += cnt; + ba += cnt; + left -= cnt; + old_percent = percent; + } + + HR_DEBUG("hr_raid1_rebuild(): rebuild finished on \"%s\" (%" PRIun "), " + "extent no. %zu\n", vol->devname, vol->svc_id, rebuild_idx); + + fibril_rwlock_write_lock(&vol->states_lock); + + hr_update_ext_state(vol, rebuild_idx, HR_EXT_ONLINE); + + atomic_store_explicit(&vol->rebuild_blk, 0, memory_order_relaxed); + + hr_mark_vol_state_dirty(vol); + + hr_update_vol_state(vol, HR_VOL_DEGRADED); + + fibril_rwlock_write_unlock(&vol->states_lock); +end: + fibril_rwlock_read_unlock(&vol->extents_lock); + + hr_raid1_vol_state_eval(vol); + + free(buf); + + return rc; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/raid5.c b/uspace/srv/bd/hr/raid5.c new file mode 100644 index 0000000000..7046115f0b --- /dev/null +++ b/uspace/srv/bd/hr/raid5.c @@ -0,0 +1,860 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io.h" +#include "parity_stripe.h" +#include "superblock.h" +#include "util.h" +#include "var.h" + +static void hr_raid5_vol_state_eval_forced(hr_volume_t *); +static size_t hr_raid5_parity_extent(hr_level_t, hr_layout_t, size_t, + uint64_t); +static size_t hr_raid5_data_extent(hr_level_t, hr_layout_t, size_t, uint64_t, + size_t); +static errno_t hr_raid5_rebuild(void *); + +/* bdops */ +static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *); +static errno_t hr_raid5_bd_close(bd_srv_t *); +static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *, + size_t); +static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t); +static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t, + const void *, size_t); +static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *); +static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *); + +static bd_ops_t hr_raid5_bd_ops = { + .open = hr_raid5_bd_open, + .close = hr_raid5_bd_close, + .sync_cache = hr_raid5_bd_sync_cache, + .read_blocks = hr_raid5_bd_read_blocks, + .write_blocks = hr_raid5_bd_write_blocks, + .get_block_size = hr_raid5_bd_get_block_size, + .get_num_blocks = hr_raid5_bd_get_num_blocks +}; + +extern loc_srv_t *hr_srv; + +errno_t hr_raid5_create(hr_volume_t *new_volume) +{ + HR_DEBUG("%s()", __func__); + + if (new_volume->level != HR_LVL_5 && new_volume->level != HR_LVL_4) + return EINVAL; + + if (new_volume->extent_no < 3) { + HR_ERROR("RAID 5 volume needs at least 3 devices\n"); + return EINVAL; + } + + hr_raid5_vol_state_eval_forced(new_volume); + + fibril_rwlock_read_lock(&new_volume->states_lock); + hr_vol_state_t state = new_volume->state; + fibril_rwlock_read_unlock(&new_volume->states_lock); + if (state == HR_VOL_FAULTY || state == HR_VOL_NONE) { + HR_NOTE("\"%s\": unusable state, not creating\n", + new_volume->devname); + return EINVAL; + } + + bd_srvs_init(&new_volume->hr_bds); + new_volume->hr_bds.ops = &hr_raid5_bd_ops; + new_volume->hr_bds.sarg = new_volume; + + return EOK; +} + +/* + * Called only once in volume's lifetime. 
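+ * Computes the parity geometry: usable data block count, strip size and parity layout.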
+ */ +errno_t hr_raid5_init(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + if (vol->level != HR_LVL_5 && vol->level != HR_LVL_4) + return EINVAL; + + vol->data_offset = vol->meta_ops->get_data_offset(); + + uint64_t single_sz = vol->truncated_blkno - vol->meta_ops->get_size(); + vol->data_blkno = single_sz * (vol->extent_no - 1); + + vol->strip_size = hr_closest_pow2(HR_STRIP_SIZE / (vol->extent_no - 1)); + + if (vol->level == HR_LVL_4) + vol->layout = HR_LAYOUT_RAID4_N; + else + vol->layout = HR_LAYOUT_RAID5_NR; + + return EOK; +} + +void hr_raid5_vol_state_eval(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + bool exp = true; + if (!atomic_compare_exchange_strong(&vol->state_dirty, &exp, false)) + return; + + vol->meta_ops->inc_counter(vol); + vol->meta_ops->save(vol, WITH_STATE_CALLBACK); + + hr_raid5_vol_state_eval_forced(vol); +} + +void hr_raid5_ext_state_cb(hr_volume_t *vol, size_t extent, errno_t rc) +{ + HR_DEBUG("%s()", __func__); + + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + + if (rc == EOK) + return; + + fibril_rwlock_write_lock(&vol->states_lock); + + switch (rc) { + case ENOENT: + hr_update_ext_state(vol, extent, HR_EXT_MISSING); + break; + default: + hr_update_ext_state(vol, extent, HR_EXT_FAILED); + } + + hr_mark_vol_state_dirty(vol); + + fibril_rwlock_write_unlock(&vol->states_lock); +} + +static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd) +{ + HR_DEBUG("%s()\n", __func__); + + hr_volume_t *vol = bd->srvs->sarg; + + atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed); + + return EOK; +} + +static errno_t hr_raid5_bd_close(bd_srv_t *bd) +{ + HR_DEBUG("%s()\n", __func__); + + hr_volume_t *vol = bd->srvs->sarg; + + atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed); + + return EOK; +} + +static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt) +{ + hr_volume_t *vol = bd->srvs->sarg; + + return hr_sync_extents(vol); +} + +static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, uint64_t ba, size_t cnt, + void *data_read, size_t size) +{ + hr_volume_t *vol = bd->srvs->sarg; + errno_t rc; + + if (size < cnt * vol->bsize) + return EINVAL; + + fibril_rwlock_read_lock(&vol->states_lock); + hr_vol_state_t vol_state = vol->state; + fibril_rwlock_read_unlock(&vol->states_lock); + + if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE) + return EIO; + + rc = hr_check_ba_range(vol, cnt, ba); + if (rc != EOK) + return rc; + + uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */ + uint64_t strip_no = ba / strip_size; + + /* calculate number of stripes touched */ + uint64_t last_ba = ba + cnt - 1; + uint64_t end_strip_no = last_ba / strip_size; + uint64_t start_stripe = strip_no / (vol->extent_no - 1); + uint64_t end_stripe = end_strip_no / (vol->extent_no - 1); + size_t stripes_cnt = end_stripe - start_stripe + 1; + + hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size, + stripes_cnt, false); + + uint64_t phys_block, len; + size_t left; + + hr_layout_t layout = vol->layout; + hr_level_t level = vol->level; + + /* parity extent */ + size_t p_extent = hr_raid5_parity_extent(level, layout, + vol->extent_no, strip_no); + + uint64_t strip_off = ba % strip_size; + + left = cnt; + + while (left != 0) { + if (level == HR_LVL_5) { + p_extent = hr_raid5_parity_extent(level, layout, + vol->extent_no, strip_no); + } + + size_t extent = hr_raid5_data_extent(level, layout, + vol->extent_no, strip_no, p_extent); + + uint64_t stripe_no = strip_no / (vol->extent_no - 1); + size_t relative_si 
= stripe_no - start_stripe; /* relative stripe index */ + hr_stripe_t *stripe = &stripes[relative_si]; + stripe->p_extent = p_extent; + + stripe->strips_touched++; + + phys_block = stripe_no * strip_size + strip_off; + cnt = min(left, strip_size - strip_off); + len = vol->bsize * cnt; + hr_add_data_offset(vol, &phys_block); + + stripe->extent_span[extent].range.start = phys_block; + stripe->extent_span[extent].range.end = phys_block + cnt - 1; + stripe->extent_span[extent].cnt = cnt; + stripe->extent_span[extent].data_read = data_read; + stripe->extent_span[extent].strip_off = strip_off; + + data_read += len; + left -= cnt; + strip_off = 0; + strip_no++; + } + + hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps)); + + /* + * extent order has to be locked for the whole IO duration, + * so that workers have consistent targets + */ + fibril_rwlock_read_lock(&vol->extents_lock); + + for (uint64_t s = start_stripe; s <= end_stripe; s++) { + uint64_t relative = s - start_stripe; + rlps[relative] = hr_range_lock_acquire(vol, s, 1); + } + +retry: + size_t bad_extent = vol->extent_no; + + uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk, + memory_order_relaxed); + + fibril_rwlock_read_lock(&vol->states_lock); + + for (size_t e = 0; e < vol->extent_no; e++) { + hr_ext_state_t s = vol->extents[e].state; + if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) || + (s == HR_EXT_REBUILD && end_stripe >= rebuild_pos)) { + bad_extent = e; + break; + } + } + + fibril_rwlock_read_unlock(&vol->states_lock); + + for (size_t s = 0; s < stripes_cnt; s++) { + if (stripes[s].done) + continue; + hr_execute_stripe(&stripes[s], bad_extent); + } + + for (size_t s = 0; s < stripes_cnt; s++) { + if (stripes[s].done) + continue; + hr_wait_for_stripe(&stripes[s]); + } + + hr_raid5_vol_state_eval(vol); + + rc = EOK; + + fibril_rwlock_read_lock(&vol->states_lock); + + if (vol->state == HR_VOL_FAULTY) { + fibril_rwlock_read_unlock(&vol->states_lock); + rc = EIO; + goto end; + } + + fibril_rwlock_read_unlock(&vol->states_lock); + + for (size_t s = 0; s < stripes_cnt; s++) + if (stripes[s].rc == EAGAIN) + goto retry; + + /* all stripes are done */ +end: + fibril_rwlock_read_unlock(&vol->extents_lock); + + for (size_t i = 0; i < stripes_cnt; i++) + hr_range_lock_release(rlps[i]); + + free(rlps); + + hr_destroy_stripes(stripes, stripes_cnt); + + return rc; +} + +static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt, + const void *data_write, size_t size) +{ + hr_volume_t *vol = bd->srvs->sarg; + errno_t rc; + + if (size < cnt * vol->bsize) + return EINVAL; + + if (vol->vflags & HR_VOL_FLAG_READ_ONLY) + return ENOTSUP; + + fibril_rwlock_read_lock(&vol->states_lock); + hr_vol_state_t vol_state = vol->state; + fibril_rwlock_read_unlock(&vol->states_lock); + + if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE) + return EIO; + + /* increment metadata counter only on first write */ + bool exp = false; + if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) { + vol->meta_ops->inc_counter(vol); + vol->meta_ops->save(vol, WITH_STATE_CALLBACK); + } + + rc = hr_check_ba_range(vol, cnt, ba); + if (rc != EOK) + return rc; + + uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */ + uint64_t strip_no = ba / strip_size; + + /* calculate number of stripes touched */ + uint64_t last_ba = ba + cnt - 1; + uint64_t end_strip_no = last_ba / strip_size; + uint64_t start_stripe = strip_no / (vol->extent_no - 1); + uint64_t end_stripe = end_strip_no / 
(vol->extent_no - 1); + size_t stripes_cnt = end_stripe - start_stripe + 1; + + hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size, + stripes_cnt, true); + + uint64_t stripe_size = strip_size * (vol->extent_no - 1); + + for (uint64_t stripe = start_stripe; stripe <= end_stripe; stripe++) { + uint64_t relative_stripe = stripe - start_stripe; + + uint64_t s_start = stripe * stripe_size; + uint64_t s_end = s_start + stripe_size - 1; + + uint64_t overlap_start; + if (ba > s_start) + overlap_start = ba; + else + overlap_start = s_start; + + uint64_t overlap_end; + if (last_ba < s_end) + overlap_end = last_ba; + else + overlap_end = s_end; + + uint64_t start_strip_index = + (overlap_start - s_start) / strip_size; + uint64_t end_strip_index = (overlap_end - s_start) / strip_size; + size_t strips_touched = end_strip_index - start_strip_index + 1; + + uint64_t first_offset = (overlap_start - s_start) % strip_size; + uint64_t last_offset = (overlap_end - s_start) % strip_size; + + size_t partials = 0; + if (first_offset != 0) + partials++; + if (last_offset != strip_size - 1) + partials++; + if (start_strip_index == end_strip_index && partials == 2) + partials = 1; + + stripes[relative_stripe].strips_touched = strips_touched; + stripes[relative_stripe].partial_strips_touched = partials; + + if (strips_touched < (vol->extent_no - 1) / 2) + stripes[relative_stripe].subtract = true; + } + + uint64_t phys_block, len; + size_t left; + + hr_layout_t layout = vol->layout; + hr_level_t level = vol->level; + + /* parity extent */ + size_t p_extent = hr_raid5_parity_extent(level, layout, + vol->extent_no, strip_no); + + uint64_t strip_off = ba % strip_size; + + left = cnt; + + while (left != 0) { + if (level == HR_LVL_5) { + p_extent = hr_raid5_parity_extent(level, layout, + vol->extent_no, strip_no); + } + + size_t extent = hr_raid5_data_extent(level, layout, + vol->extent_no, strip_no, p_extent); + + uint64_t stripe_no = strip_no / (vol->extent_no - 1); + size_t relative_si = stripe_no - start_stripe; /* relative stripe index */ + hr_stripe_t *stripe = &stripes[relative_si]; + stripe->p_extent = p_extent; + + phys_block = stripe_no * strip_size + strip_off; + cnt = min(left, strip_size - strip_off); + len = vol->bsize * cnt; + hr_add_data_offset(vol, &phys_block); + + stripe->extent_span[extent].range.start = phys_block; + stripe->extent_span[extent].range.end = phys_block + cnt - 1; + stripe->extent_span[extent].cnt = cnt; + stripe->extent_span[extent].data_write = data_write; + stripe->extent_span[extent].strip_off = strip_off; + + data_write += len; + left -= cnt; + strip_off = 0; + strip_no++; + } + + hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps)); + + /* + * extent order has to be locked for the whole IO duration, + * so that workers have consistent targets + */ + fibril_rwlock_read_lock(&vol->extents_lock); + + for (uint64_t s = start_stripe; s <= end_stripe; s++) { + uint64_t relative = s - start_stripe; + rlps[relative] = hr_range_lock_acquire(vol, s, 1); + } + +retry: + size_t bad_extent = vol->extent_no; + + uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk, + memory_order_relaxed); + + fibril_rwlock_read_lock(&vol->states_lock); + + for (size_t e = 0; e < vol->extent_no; e++) { + hr_ext_state_t s = vol->extents[e].state; + if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) || + (s == HR_EXT_REBUILD && start_stripe > rebuild_pos)) { + bad_extent = e; + break; + } + } + + 
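+ /* bad_extent == vol->extent_no means no degraded or still-rebuilding extent is involved */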
fibril_rwlock_read_unlock(&vol->states_lock); + + for (size_t s = 0; s < stripes_cnt; s++) { + if (stripes[s].done) + continue; + hr_execute_stripe(&stripes[s], bad_extent); + } + + for (size_t s = 0; s < stripes_cnt; s++) { + if (stripes[s].done) + continue; + hr_wait_for_stripe(&stripes[s]); + } + + hr_raid5_vol_state_eval(vol); + + rc = EOK; + + fibril_rwlock_read_lock(&vol->states_lock); + + if (vol->state == HR_VOL_FAULTY) { + fibril_rwlock_read_unlock(&vol->states_lock); + rc = EIO; + goto end; + } + + fibril_rwlock_read_unlock(&vol->states_lock); + + for (size_t s = 0; s < stripes_cnt; s++) + if (stripes[s].rc == EAGAIN) + goto retry; + + /* all stripes are done */ +end: + fibril_rwlock_read_unlock(&vol->extents_lock); + + for (size_t i = 0; i < stripes_cnt; i++) + hr_range_lock_release(rlps[i]); + + free(rlps); + + hr_destroy_stripes(stripes, stripes_cnt); + + return rc; +} + +static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize) +{ + hr_volume_t *vol = bd->srvs->sarg; + + *rsize = vol->bsize; + return EOK; +} + +static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb) +{ + hr_volume_t *vol = bd->srvs->sarg; + + *rnb = vol->data_blkno; + return EOK; +} + +static void hr_raid5_vol_state_eval_forced(hr_volume_t *vol) +{ + fibril_rwlock_read_lock(&vol->extents_lock); + fibril_rwlock_write_lock(&vol->states_lock); + + hr_vol_state_t state = vol->state; + + size_t bad = 0; + for (size_t i = 0; i < vol->extent_no; i++) + if (vol->extents[i].state != HR_EXT_ONLINE) + bad++; + + size_t invalid_no = hr_count_extents(vol, HR_EXT_INVALID); + + size_t rebuild_no = hr_count_extents(vol, HR_EXT_REBUILD); + + fibril_mutex_lock(&vol->hotspare_lock); + size_t hs_no = vol->hotspare_no; + fibril_mutex_unlock(&vol->hotspare_lock); + + switch (bad) { + case 0: + if (state != HR_VOL_OPTIMAL) + hr_update_vol_state(vol, HR_VOL_OPTIMAL); + break; + case 1: + if (state != HR_VOL_DEGRADED && state != HR_VOL_REBUILD) + hr_update_vol_state(vol, HR_VOL_DEGRADED); + + if (state != HR_VOL_REBUILD) { + if (hs_no > 0 || invalid_no > 0 || rebuild_no > 0) { + fid_t fib = fibril_create(hr_raid5_rebuild, + vol); + if (fib == 0) + break; + fibril_start(fib); + fibril_detach(fib); + } + } + break; + default: + if (state != HR_VOL_FAULTY) + hr_update_vol_state(vol, HR_VOL_FAULTY); + break; + } + + fibril_rwlock_write_unlock(&vol->states_lock); + fibril_rwlock_read_unlock(&vol->extents_lock); +} + +static size_t hr_raid5_parity_extent(hr_level_t level, + hr_layout_t layout, size_t extent_no, uint64_t strip_no) +{ + switch (level) { + case HR_LVL_4: + switch (layout) { + case HR_LAYOUT_RAID4_0: + return (0); + case HR_LAYOUT_RAID4_N: + return (extent_no - 1); + default: + assert(0 && "invalid layout configuration"); + } + case HR_LVL_5: + switch (layout) { + case HR_LAYOUT_RAID5_0R: + return ((strip_no / (extent_no - 1)) % extent_no); + case HR_LAYOUT_RAID5_NR: + case HR_LAYOUT_RAID5_NC: + return ((extent_no - 1) - + (strip_no / (extent_no - 1)) % extent_no); + default: + assert(0 && "invalid layout configuration"); + } + default: + assert(0 && "invalid layout configuration"); + } +} + +static size_t hr_raid5_data_extent(hr_level_t level, + hr_layout_t layout, size_t extent_no, uint64_t strip_no, size_t p_extent) +{ + switch (level) { + case HR_LVL_4: + switch (layout) { + case HR_LAYOUT_RAID4_0: + return ((strip_no % (extent_no - 1)) + 1); + case HR_LAYOUT_RAID4_N: + return (strip_no % (extent_no - 1)); + default: + assert(0 && "invalid layout configuration"); + } + case HR_LVL_5: + switch 
(layout) { + case HR_LAYOUT_RAID5_0R: + case HR_LAYOUT_RAID5_NR: + if ((strip_no % (extent_no - 1)) < p_extent) + return (strip_no % (extent_no - 1)); + else + return ((strip_no % (extent_no - 1)) + 1); + case HR_LAYOUT_RAID5_NC: + return (((strip_no % (extent_no - 1)) + p_extent + 1) % + extent_no); + default: + assert(0 && "invalid layout configuration"); + } + default: + assert(0 && "invalid layout configuration"); + } +} + +static errno_t hr_raid5_rebuild(void *arg) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol = arg; + errno_t rc = EOK; + size_t rebuild_idx; + + if (vol->vflags & HR_VOL_FLAG_READ_ONLY) + return ENOTSUP; + if (!(vol->meta_ops->get_flags() & HR_METADATA_ALLOW_REBUILD)) + return ENOTSUP; + + rc = hr_init_rebuild(vol, &rebuild_idx); + if (rc != EOK) + return rc; + + uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize; + uint64_t left = + vol->data_blkno / (vol->extent_no - 1) - vol->rebuild_blk; + + uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */ + + size_t cnt; + uint64_t ba = vol->rebuild_blk; + hr_add_data_offset(vol, &ba); + + /* + * this is not necessary because a rebuild is + * protected by itself, i.e. there can be only + * one REBUILD at a time + */ + fibril_rwlock_read_lock(&vol->extents_lock); + + /* increment metadata counter only on first write */ + bool exp = false; + if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) { + vol->meta_ops->inc_counter(vol); + vol->meta_ops->save(vol, WITH_STATE_CALLBACK); + } + + hr_range_lock_t *rl = NULL; + hr_stripe_t *stripe = hr_create_stripes(vol, max_blks * vol->bsize, 1, + false); + + HR_NOTE("\"%s\": REBUILD started on extent no. %zu at " + "block %" PRIu64 ".\n", + vol->devname, rebuild_idx, ba); + + uint64_t written = 0; + unsigned int percent, old_percent = 100; + while (left != 0) { + cnt = min(left, max_blks); + + uint64_t strip_no = ba / strip_size; + uint64_t last_ba = ba + cnt - 1; + uint64_t end_strip_no = last_ba / strip_size; + uint64_t start_stripe = strip_no / (vol->extent_no - 1); + uint64_t end_stripe = end_strip_no / (vol->extent_no - 1); + size_t stripes_cnt = end_stripe - start_stripe + 1; + + stripe->ps_to_be_added = vol->extent_no - 1; + stripe->p_count_final = true; + + hr_fgroup_t *worker_group = + hr_fgroup_create(vol->fge, vol->extent_no); + + rl = hr_range_lock_acquire(vol, start_stripe, stripes_cnt); + + atomic_store_explicit(&vol->rebuild_blk, ba, + memory_order_relaxed); + + for (size_t e = 0; e < vol->extent_no; e++) { + if (e == rebuild_idx) + continue; + + hr_io_raid5_t *io = hr_fgroup_alloc(worker_group); + io->extent = e; + io->ba = ba; + io->cnt = cnt; + io->strip_off = 0; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(worker_group, + hr_io_raid5_reconstruct_reader, io); + } + + hr_io_raid5_t *io = hr_fgroup_alloc(worker_group); + io->extent = rebuild_idx; + io->ba = ba; + io->cnt = cnt; + io->strip_off = 0; + io->vol = vol; + io->stripe = stripe; + + hr_fgroup_submit(worker_group, hr_io_raid5_parity_writer, io); + + size_t failed; + (void)hr_fgroup_wait(worker_group, NULL, &failed); + if (failed > 0) { + hr_range_lock_release(rl); + HR_NOTE("\"%s\": REBUILD aborted.\n", vol->devname); + goto end; + } + + percent = ((ba + cnt) * 100) / vol->data_blkno; + if (percent != old_percent) { + if (percent % 5 == 0) + HR_DEBUG("\"%s\" REBUILD progress: %u%%\n", + vol->devname, percent); + } + + if (written * vol->bsize > HR_REBUILD_SAVE_BYTES) { + vol->meta_ops->save_ext(vol, rebuild_idx, + WITH_STATE_CALLBACK); + written = 0; + } + + 
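+ /* release the range lock and reset the stripe before the next chunk */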
hr_range_lock_release(rl); + hr_reset_stripe(stripe); + + written += cnt; + ba += cnt; + left -= cnt; + old_percent = percent; + + /* + * Let other IO requests be served + * during rebuild. + */ + } + + HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), " + "extent number %zu\n", vol->devname, vol->svc_id, rebuild_idx); + + fibril_rwlock_write_lock(&vol->states_lock); + + hr_update_ext_state(vol, rebuild_idx, HR_EXT_ONLINE); + + atomic_store_explicit(&vol->rebuild_blk, 0, memory_order_relaxed); + + hr_mark_vol_state_dirty(vol); + + hr_update_vol_state(vol, HR_VOL_DEGRADED); + + fibril_rwlock_write_unlock(&vol->states_lock); +end: + fibril_rwlock_read_unlock(&vol->extents_lock); + + hr_raid5_vol_state_eval(vol); + + hr_destroy_stripes(stripe, 1); + + return rc; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/superblock.c b/uspace/srv/bd/hr/superblock.c new file mode 100644 index 0000000000..c0893bb833 --- /dev/null +++ b/uspace/srv/bd/hr/superblock.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "superblock.h" +#include "util.h" +#include "var.h" + +#include "metadata/native.h" + +#include "metadata/foreign/geom/g_mirror.h" +#include "metadata/foreign/geom/g_stripe.h" +#include "metadata/foreign/softraid/softraidvar.h" + +extern hr_superblock_ops_t metadata_native_ops; +extern hr_superblock_ops_t metadata_gmirror_ops; +extern hr_superblock_ops_t metadata_gstripe_ops; +extern hr_superblock_ops_t metadata_softraid_ops; +extern hr_superblock_ops_t metadata_md_ops; +extern hr_superblock_ops_t noop_ops; + +static hr_superblock_ops_t *hr_superblock_ops_all[] = { + [HR_METADATA_NATIVE] = &metadata_native_ops, + [HR_METADATA_GEOM_MIRROR] = &metadata_gmirror_ops, + [HR_METADATA_GEOM_STRIPE] = &metadata_gstripe_ops, + [HR_METADATA_SOFTRAID] = &metadata_softraid_ops, + [HR_METADATA_MD] = &metadata_md_ops, + [HR_METADATA_NOOP] = &noop_ops +}; + +hr_superblock_ops_t *hr_get_meta_type_ops(hr_metadata_type_t type) +{ + assert(type >= HR_METADATA_NATIVE && + type < HR_METADATA_LAST_PLACEHOLDER); + + return hr_superblock_ops_all[type]; +} + +errno_t hr_find_metadata(service_id_t svc_id, void **rmetadata, + hr_metadata_type_t *rtype) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + hr_superblock_ops_t *meta_ops; + void *metadata_struct; + + if (rmetadata == NULL) + return EINVAL; + if (rtype == NULL) + return EINVAL; + + volatile hr_metadata_type_t type = HR_METADATA_NATIVE; + for (; type < HR_METADATA_LAST_PLACEHOLDER; type++) { + meta_ops = hr_superblock_ops_all[type]; + + rc = meta_ops->probe(svc_id, &metadata_struct); + if (rc == ENOMEM) + return ENOMEM; + if (rc != EOK) + continue; + + *rmetadata = metadata_struct; + *rtype = type; + return EOK; + } + + return ENOFS; +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/superblock.h b/uspace/srv/bd/hr/superblock.h new file mode 100644 index 0000000000..075e44f109 --- /dev/null +++ b/uspace/srv/bd/hr/superblock.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_SUPERBLOCK_H +#define _HR_SUPERBLOCK_H + +#include "var.h" + +typedef struct hr_volume hr_volume_t; + +#define HR_METADATA_HOTSPARE_SUPPORT 0x01 +#define HR_METADATA_ALLOW_REBUILD 0x02 + +typedef struct hr_superblock_ops { + errno_t (*probe)(service_id_t, void **); + errno_t (*init_vol2meta)(hr_volume_t *); + errno_t (*init_meta2vol)(const list_t *, hr_volume_t *); + errno_t (*erase_block)(service_id_t); + bool (*compare_uuids)(const void *, const void *); + void (*inc_counter)(hr_volume_t *); + errno_t (*save)(hr_volume_t *, bool); + errno_t (*save_ext)(hr_volume_t *, size_t, bool); + const char *(*get_devname)(const void *); + hr_level_t (*get_level)(const void *); + uint64_t (*get_data_offset)(void); + size_t (*get_size)(void); + uint8_t (*get_flags)(void); + void (*dump)(const void *); + hr_metadata_type_t (*get_type)(void); +} hr_superblock_ops_t; + +extern hr_superblock_ops_t *hr_get_meta_type_ops(hr_metadata_type_t); +extern errno_t hr_find_metadata(service_id_t, void **, hr_metadata_type_t *); + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/util.c b/uspace/srv/bd/hr/util.c new file mode 100644 index 0000000000..b94508900e --- /dev/null +++ b/uspace/srv/bd/hr/util.c @@ -0,0 +1,1268 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io.h" +#include "superblock.h" +#include "util.h" +#include "var.h" + +static bool hr_range_lock_overlap(hr_range_lock_t *, hr_range_lock_t *); +static errno_t hr_add_svc_linked_to_list(list_t *, service_id_t, bool, void *); +static void free_dev_list_member(struct dev_list_member *); +static void free_svc_id_list(list_t *); +static errno_t hr_fill_disk_part_svcs_list(list_t *); +static errno_t block_init_dev_list(list_t *); +static void block_fini_dev_list(list_t *); +static errno_t hr_util_get_matching_md_svcs_list(list_t *, list_t *, + service_id_t, hr_metadata_type_t, void *); +static errno_t hr_util_assemble_from_matching_list(list_t *, + hr_metadata_type_t, uint8_t); +static errno_t hr_fill_svcs_list_from_cfg(hr_config_t *, list_t *); +static errno_t hr_swap_hs(hr_volume_t *, size_t, size_t); + +#define HR_RL_LIST_LOCK(vol) (fibril_mutex_lock(&(vol)->range_lock_list_lock)) +#define HR_RL_LIST_UNLOCK(vol) \ + (fibril_mutex_unlock(&(vol)->range_lock_list_lock)) + +extern loc_srv_t *hr_srv; +extern list_t hr_volumes; +extern fibril_rwlock_t hr_volumes_lock; + +/* + * malloc() wrapper that behaves like + * FreeBSD malloc(9) with the M_WAITOK flag. + * + * Return value is never NULL. + */ +void *hr_malloc_waitok(size_t size) +{ + void *ret; + while ((ret = malloc(size)) == NULL) + fibril_usleep(MSEC2USEC(250)); /* sleep 250 ms */ + + return ret; +} + +void *hr_calloc_waitok(size_t nmemb, size_t size) +{ + void *ret; + while ((ret = calloc(nmemb, size)) == NULL) + fibril_usleep(MSEC2USEC(250)); /* sleep 250 ms */ + + return ret; +} + +errno_t hr_create_vol_struct(hr_volume_t **rvol, hr_level_t level, + const char *devname, hr_metadata_type_t metadata_type, uint8_t vflags) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + + hr_volume_t *vol = calloc(1, sizeof(hr_volume_t)); + if (vol == NULL) + return ENOMEM; + + str_cpy(vol->devname, HR_DEVNAME_LEN, devname); + vol->level = level; + + vol->vflags = vflags; + + vol->meta_ops = hr_get_meta_type_ops(metadata_type); + + switch (level) { + case HR_LVL_0: + vol->hr_ops.create = hr_raid0_create; + vol->hr_ops.init = hr_raid0_init; + vol->hr_ops.vol_state_eval = hr_raid0_vol_state_eval; + vol->hr_ops.ext_state_cb = hr_raid0_ext_state_cb; + break; + case HR_LVL_1: + vol->hr_ops.create = hr_raid1_create; + vol->hr_ops.init = hr_raid1_init; + vol->hr_ops.vol_state_eval = hr_raid1_vol_state_eval; + vol->hr_ops.ext_state_cb = hr_raid1_ext_state_cb; + break; + case HR_LVL_4: + case HR_LVL_5: + vol->hr_ops.create = hr_raid5_create; + vol->hr_ops.init = hr_raid5_init; + vol->hr_ops.vol_state_eval = hr_raid5_vol_state_eval; + vol->hr_ops.ext_state_cb = hr_raid5_ext_state_cb; + break; + default: + HR_DEBUG("unknown level: %d, aborting\n", vol->level); + rc = EINVAL; + goto error; + } + + if (level == HR_LVL_4 || level == HR_LVL_5) + vol->fge = hr_fpool_create(16, 32, sizeof(hr_io_raid5_t)); + else + vol->fge = hr_fpool_create(16, 32, sizeof(hr_io_t)); + + if (vol->fge == NULL) { + rc = ENOMEM; + goto error; + } + + vol->state = HR_VOL_NONE; + + fibril_mutex_initialize(&vol->md_lock); + + fibril_rwlock_initialize(&vol->extents_lock); + fibril_rwlock_initialize(&vol->states_lock); + + fibril_mutex_initialize(&vol->hotspare_lock); + + list_initialize(&vol->range_lock_list); + fibril_mutex_initialize(&vol->range_lock_list_lock); + +
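/* + * These atomic bookkeeping fields are accessed without holding the + * volume locks later on, so initialize them before the volume is + * published in the volume list. + */ +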
atomic_init(&vol->state_dirty, false); + atomic_init(&vol->first_write, false); + for (size_t i = 0; i < HR_MAX_EXTENTS; i++) + atomic_init(&vol->last_ext_pos_arr[i], 0); + atomic_init(&vol->last_ext_used, 0); + atomic_init(&vol->rebuild_blk, 0); + atomic_init(&vol->open_cnt, 0); + + *rvol = vol; + + return EOK; +error: + free(vol); + return rc; +} + +void hr_destroy_vol_struct(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + if (vol == NULL) + return; + + hr_fpool_destroy(vol->fge); + hr_fini_devs(vol); + free(vol->in_mem_md); + free(vol); +} + +errno_t hr_get_volume_svcs(size_t *rcnt, service_id_t **rsvcs) +{ + size_t i; + service_id_t *vol_svcs; + + if (rcnt == NULL || rsvcs == NULL) + return EINVAL; + + fibril_rwlock_read_lock(&hr_volumes_lock); + + size_t vol_cnt = list_count(&hr_volumes); + vol_svcs = malloc(vol_cnt * sizeof(service_id_t)); + if (vol_svcs == NULL) { + fibril_rwlock_read_unlock(&hr_volumes_lock); + return ENOMEM; + } + + i = 0; + list_foreach(hr_volumes, lvolumes, hr_volume_t, iter) + vol_svcs[i++] = iter->svc_id; + + fibril_rwlock_read_unlock(&hr_volumes_lock); + + *rcnt = vol_cnt; + *rsvcs = vol_svcs; + + return EOK; +} + +hr_volume_t *hr_get_volume(service_id_t svc_id) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *rvol = NULL; + + fibril_rwlock_read_lock(&hr_volumes_lock); + list_foreach(hr_volumes, lvolumes, hr_volume_t, iter) { + if (iter->svc_id == svc_id) { + rvol = iter; + break; + } + } + fibril_rwlock_read_unlock(&hr_volumes_lock); + + return rvol; +} + +errno_t hr_remove_volume(service_id_t svc_id) +{ + HR_DEBUG("%s()", __func__); + + hr_volume_t *vol = hr_get_volume(svc_id); + if (vol == NULL) + return ENOENT; + + fibril_rwlock_write_lock(&hr_volumes_lock); + + int open_cnt = atomic_load_explicit(&vol->open_cnt, + memory_order_relaxed); + + /* + * The atomicity of this if condition (and this whole + * operation) is provided by the write lock - no new + * bd connection can come, because we need to get the + * bd_srvs_t from the volume, which we get from the list. 
+ * (see hr_client_conn() in hr.c) + */ + if (open_cnt > 0) { + fibril_rwlock_write_unlock(&hr_volumes_lock); + return EBUSY; + } + + list_remove(&vol->lvolumes); + + fibril_rwlock_write_unlock(&hr_volumes_lock); + + /* save metadata, but we don't care about states anymore */ + vol->meta_ops->save(vol, NO_STATE_CALLBACK); + + HR_NOTE("deactivating volume \"%s\"\n", vol->devname); + + hr_destroy_vol_struct(vol); + + errno_t rc = loc_service_unregister(hr_srv, svc_id); + return rc; +} + +errno_t hr_init_extents_from_cfg(hr_volume_t *vol, hr_config_t *cfg) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + uint64_t blkno, smallest_blkno = ~0ULL; + size_t i, bsize; + size_t last_bsize = 0; + + for (i = 0; i < cfg->dev_no; i++) { + service_id_t svc_id = cfg->devs[i]; + if (svc_id == 0) { + rc = EINVAL; + goto error; + } + + HR_DEBUG("%s(): block_init() on (%" PRIun ")\n", __func__, + svc_id); + rc = block_init(svc_id); + if (rc != EOK) { + HR_DEBUG("%s(): initing (%" PRIun ") failed, " + "aborting\n", __func__, svc_id); + goto error; + } + + rc = block_get_nblocks(svc_id, &blkno); + if (rc != EOK) + goto error; + + rc = block_get_bsize(svc_id, &bsize); + if (rc != EOK) + goto error; + + if (last_bsize != 0 && bsize != last_bsize) { + HR_DEBUG("block sizes differ\n"); + rc = EINVAL; + goto error; + } + + vol->extents[i].svc_id = svc_id; + vol->extents[i].state = HR_EXT_ONLINE; + + if (blkno < smallest_blkno) + smallest_blkno = blkno; + last_bsize = bsize; + } + + vol->bsize = last_bsize; + vol->extent_no = cfg->dev_no; + vol->truncated_blkno = smallest_blkno; + + for (i = 0; i < HR_MAX_HOTSPARES; i++) + vol->hotspares[i].state = HR_EXT_MISSING; + + return EOK; + +error: + for (i = 0; i < HR_MAX_EXTENTS; i++) { + if (vol->extents[i].svc_id != 0) + block_fini(vol->extents[i].svc_id); + } + + return rc; +} + +void hr_fini_devs(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + size_t i; + + for (i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].svc_id != 0) { + HR_DEBUG("hr_fini_devs(): block_fini() on " + "(%" PRIun ")\n", vol->extents[i].svc_id); + block_fini(vol->extents[i].svc_id); + } + } + + for (i = 0; i < vol->hotspare_no; i++) { + if (vol->hotspares[i].svc_id != 0) { + HR_DEBUG("hr_fini_devs(): block_fini() on " + "(%" PRIun ")\n", + vol->hotspares[i].svc_id); + block_fini(vol->hotspares[i].svc_id); + } + } +} + +errno_t hr_register_volume(hr_volume_t *vol) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + service_id_t new_id; + category_id_t cat_id; + const char *devname = vol->devname; + + rc = loc_service_register(hr_srv, devname, fallback_port_id, &new_id); + if (rc != EOK) { + HR_ERROR("unable to register device \"%s\": %s\n", + devname, str_error(rc)); + return rc; + } + + rc = loc_category_get_id("raid", &cat_id, IPC_FLAG_BLOCKING); + if (rc != EOK) { + HR_ERROR("failed resolving category \"raid\": %s\n", + str_error(rc)); + goto error; + } + + rc = loc_service_add_to_cat(hr_srv, new_id, cat_id); + if (rc != EOK) { + HR_ERROR("failed adding \"%s\" to category \"raid\": %s\n", + devname, str_error(rc)); + goto error; + } + + vol->svc_id = new_id; + return EOK; +error: + rc = loc_service_unregister(hr_srv, new_id); + return rc; +} + +errno_t hr_check_ba_range(hr_volume_t *vol, size_t cnt, uint64_t ba) +{ + if (ba + cnt > vol->data_blkno) + return ERANGE; + return EOK; +} + +void hr_add_data_offset(hr_volume_t *vol, uint64_t *ba) +{ + *ba = *ba + vol->data_offset; +} + +void hr_sub_data_offset(hr_volume_t *vol, uint64_t *ba) +{ + *ba = *ba - vol->data_offset; +} + +void 
hr_update_ext_state(hr_volume_t *vol, size_t ext_idx, hr_ext_state_t s) +{ + if (vol->level != HR_LVL_0) + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + + assert(fibril_rwlock_is_write_locked(&vol->states_lock)); + + assert(ext_idx < vol->extent_no); + + hr_ext_state_t old = vol->extents[ext_idx].state; + HR_DEBUG("\"%s\": changing extent %zu state: %s -> %s\n", + vol->devname, ext_idx, hr_get_ext_state_str(old), + hr_get_ext_state_str(s)); + vol->extents[ext_idx].state = s; +} + +void hr_update_hotspare_state(hr_volume_t *vol, size_t hs_idx, + hr_ext_state_t s) +{ + assert(fibril_mutex_is_locked(&vol->hotspare_lock)); + + assert(hs_idx < vol->hotspare_no); + + hr_ext_state_t old = vol->hotspares[hs_idx].state; + HR_DEBUG("\"%s\": changing hotspare %zu state: %s -> %s\n", + vol->devname, hs_idx, hr_get_ext_state_str(old), + hr_get_ext_state_str(s)); + vol->hotspares[hs_idx].state = s; +} + +void hr_update_vol_state(hr_volume_t *vol, hr_vol_state_t new) +{ + assert(fibril_rwlock_is_write_locked(&vol->states_lock)); + + HR_NOTE("\"%s\": volume state changed: %s -> %s\n", vol->devname, + hr_get_vol_state_str(vol->state), hr_get_vol_state_str(new)); + vol->state = new; +} + +void hr_update_ext_svc_id(hr_volume_t *vol, size_t ext_idx, service_id_t new) +{ + if (vol->level != HR_LVL_0) + assert(fibril_rwlock_is_write_locked(&vol->extents_lock)); + + assert(ext_idx < vol->extent_no); + + service_id_t old = vol->extents[ext_idx].svc_id; + HR_DEBUG("\"%s\": changing extent no. %zu svc_id: (%" PRIun ") -> " + "(%" PRIun ")\n", vol->devname, ext_idx, old, new); + vol->extents[ext_idx].svc_id = new; +} + +void hr_update_hotspare_svc_id(hr_volume_t *vol, size_t hs_idx, + service_id_t new) +{ + assert(fibril_mutex_is_locked(&vol->hotspare_lock)); + + assert(hs_idx < vol->hotspare_no); + + service_id_t old = vol->hotspares[hs_idx].svc_id; + HR_DEBUG("\"%s\": changing hotspare no. 
%zu svc_id: (%" PRIun ") -> " + "(%" PRIun ")\n", vol->devname, hs_idx, old, new); + vol->hotspares[hs_idx].svc_id = new; +} + +size_t hr_count_extents(hr_volume_t *vol, hr_ext_state_t state) +{ + if (vol->level != HR_LVL_0) + assert(fibril_rwlock_is_locked(&vol->extents_lock)); + assert(fibril_rwlock_is_locked(&vol->states_lock)); + + size_t count = 0; + for (size_t i = 0; i < vol->extent_no; i++) + if (vol->extents[i].state == state) + count++; + + return count; +} + +hr_range_lock_t *hr_range_lock_acquire(hr_volume_t *vol, uint64_t ba, + uint64_t cnt) +{ + hr_range_lock_t *rl = hr_malloc_waitok(sizeof(hr_range_lock_t)); + + rl->vol = vol; + rl->off = ba; + rl->len = cnt; + + rl->pending = 1; + rl->ignore = false; + + link_initialize(&rl->link); + fibril_mutex_initialize(&rl->lock); + + fibril_mutex_lock(&rl->lock); + +again: + HR_RL_LIST_LOCK(vol); + list_foreach(vol->range_lock_list, link, hr_range_lock_t, rlp) { + if (rlp->ignore) + continue; + if (hr_range_lock_overlap(rlp, rl)) { + rlp->pending++; + + HR_RL_LIST_UNLOCK(vol); + + fibril_mutex_lock(&rlp->lock); + + HR_RL_LIST_LOCK(vol); + + rlp->pending--; + + /* + * when ignore is set, after HR_RL_LIST_UNLOCK(), + * noone new is going to be able to start sleeping + * on the ignored range lock, only already waiting + * IOs will come through here + */ + rlp->ignore = true; + + fibril_mutex_unlock(&rlp->lock); + + if (rlp->pending == 0) { + list_remove(&rlp->link); + free(rlp); + } + + HR_RL_LIST_UNLOCK(vol); + goto again; + } + } + + list_append(&rl->link, &vol->range_lock_list); + + HR_RL_LIST_UNLOCK(vol); + return rl; +} + +void hr_range_lock_release(hr_range_lock_t *rl) +{ + if (rl == NULL) + return; + + HR_RL_LIST_LOCK(rl->vol); + + rl->pending--; + + fibril_mutex_unlock(&rl->lock); + + if (rl->pending == 0) { + list_remove(&rl->link); + free(rl); + } + + HR_RL_LIST_UNLOCK(rl->vol); +} + +static bool hr_range_lock_overlap(hr_range_lock_t *rl1, hr_range_lock_t *rl2) +{ + uint64_t rl1_start = rl1->off; + uint64_t rl1_end = rl1->off + rl1->len - 1; + uint64_t rl2_start = rl2->off; + uint64_t rl2_end = rl2->off + rl2->len - 1; + + /* one ends before the other starts */ + if (rl1_end < rl2_start || rl2_end < rl1_start) + return false; + + return true; +} + +void hr_mark_vol_state_dirty(hr_volume_t *vol) +{ + atomic_store(&vol->state_dirty, true); +} + +static errno_t hr_add_svc_linked_to_list(list_t *list, service_id_t svc_id, + bool inited, void *md) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + struct dev_list_member *to_add; + + if (list == NULL) + return EINVAL; + + to_add = malloc(sizeof(struct dev_list_member)); + if (to_add == NULL) { + rc = ENOMEM; + goto error; + } + + to_add->svc_id = svc_id; + to_add->inited = inited; + to_add->fini = true; + + if (md != NULL) { + to_add->md = md; + to_add->md_present = true; + } else { + to_add->md_present = false; + } + + list_append(&to_add->link, list); + +error: + return rc; +} + +static void free_dev_list_member(struct dev_list_member *p) +{ + HR_DEBUG("%s()", __func__); + + if (p->md_present) + free(p->md); + free(p); +} + +static void free_svc_id_list(list_t *list) +{ + HR_DEBUG("%s()", __func__); + + struct dev_list_member *dev_id; + while (!list_empty(list)) { + dev_id = list_pop(list, struct dev_list_member, link); + + free_dev_list_member(dev_id); + } +} + +static errno_t hr_fill_disk_part_svcs_list(list_t *list) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc; + size_t disk_count; + service_id_t *disk_svcs = NULL; + vbd_t *vbd = NULL; + + rc = vbd_create(&vbd); + if 
(rc != EOK) + goto error; + + rc = vbd_get_disks(vbd, &disk_svcs, &disk_count); + if (rc != EOK) + goto error; + + for (size_t i = 0; i < disk_count; i++) { + vbd_disk_info_t disk_info; + rc = vbd_disk_info(vbd, disk_svcs[i], &disk_info); + if (rc != EOK) + goto error; + + if (disk_info.ltype != lt_none) { + size_t part_count; + service_id_t *part_ids = NULL; + rc = vbd_label_get_parts(vbd, disk_svcs[i], &part_ids, + &part_count); + if (rc != EOK) + goto error; + + for (size_t j = 0; j < part_count; j++) { + vbd_part_info_t part_info; + rc = vbd_part_get_info(vbd, part_ids[j], + &part_info); + if (rc != EOK) { + free(part_ids); + goto error; + } + + rc = hr_add_svc_linked_to_list(list, + part_info.svc_id, false, NULL); + if (rc != EOK) { + free(part_ids); + goto error; + } + } + + free(part_ids); + + /* + * vbd can detect some bogus label type, but + * no partitions. In that case we handle the + * svc_id as a label-less disk. + * + * This can happen when creating an exfat fs + * in FreeBSD for example. + */ + if (part_count == 0) + disk_info.ltype = lt_none; + } + + if (disk_info.ltype == lt_none) { + rc = hr_add_svc_linked_to_list(list, disk_svcs[i], + false, NULL); + if (rc != EOK) + goto error; + } + } + + free(disk_svcs); + vbd_destroy(vbd); + return EOK; +error: + free_svc_id_list(list); + if (disk_svcs != NULL) + free(disk_svcs); + vbd_destroy(vbd); + + return rc; +} + +static errno_t block_init_dev_list(list_t *list) +{ + HR_DEBUG("%s()", __func__); + + list_foreach_safe(*list, cur_link, next_link) { + struct dev_list_member *iter; + iter = list_get_instance(cur_link, struct dev_list_member, + link); + + if (iter->inited) + continue; + + errno_t rc = block_init(iter->svc_id); + + if (rc == EEXIST) { + list_remove(cur_link); + free_dev_list_member(iter); + continue; + } + + if (rc != EOK) + return rc; + + iter->inited = true; + iter->fini = true; + } + + return EOK; +} + +static void block_fini_dev_list(list_t *list) +{ + HR_DEBUG("%s()", __func__); + + list_foreach(*list, link, struct dev_list_member, iter) { + if (iter->inited && iter->fini) { + block_fini(iter->svc_id); + iter->inited = false; + iter->fini = false; + } + } +} + +static errno_t hr_util_get_matching_md_svcs_list(list_t *rlist, list_t *list, + service_id_t svc_id, hr_metadata_type_t type_main, + void *metadata_struct_main) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + + hr_superblock_ops_t *meta_ops = hr_get_meta_type_ops(type_main); + + list_foreach(*list, link, struct dev_list_member, iter) { + if (iter->svc_id == svc_id) + continue; + + void *metadata_struct; + hr_metadata_type_t type; + + rc = hr_find_metadata(iter->svc_id, &metadata_struct, &type); + if (rc == ENOFS) + continue; + if (rc != EOK) + goto error; + + if (type != type_main) { + free(metadata_struct); + continue; + } + + if (!meta_ops->compare_uuids(metadata_struct_main, + metadata_struct)) { + free(metadata_struct); + continue; + } + + rc = hr_add_svc_linked_to_list(rlist, iter->svc_id, true, + metadata_struct); + if (rc != EOK) + goto error; + } + + return EOK; +error: + free_svc_id_list(rlist); + return rc; +} + +static errno_t hr_util_assemble_from_matching_list(list_t *list, + hr_metadata_type_t type, uint8_t vflags) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + + hr_superblock_ops_t *meta_ops = hr_get_meta_type_ops(type); + + link_t *memb_l = list_first(list); + struct dev_list_member *memb = list_get_instance(memb_l, + struct dev_list_member, link); + + hr_level_t level = meta_ops->get_level(memb->md); + const char 
*devname = meta_ops->get_devname(memb->md); + + hr_volume_t *vol; + rc = hr_create_vol_struct(&vol, level, devname, type, vflags); + if (rc != EOK) + return rc; + + rc = meta_ops->init_meta2vol(list, vol); + if (rc != EOK) + goto error; + + rc = vol->hr_ops.create(vol); + if (rc != EOK) + goto error; + + for (size_t e = 0; e < vol->extent_no; e++) { + if (vol->extents[e].svc_id == 0) + continue; + list_foreach(*list, link, struct dev_list_member, iter) { + if (iter->svc_id == vol->extents[e].svc_id) + iter->fini = false; + } + } + + rc = hr_register_volume(vol); + if (rc != EOK) + goto error; + + fibril_rwlock_write_lock(&hr_volumes_lock); + list_append(&vol->lvolumes, &hr_volumes); + fibril_rwlock_write_unlock(&hr_volumes_lock); + + HR_NOTE("assembled volume \"%s\"\n", vol->devname); + + return EOK; +error: + /* let the caller fini the block svc list */ + for (size_t e = 0; e < vol->extent_no; e++) + vol->extents[e].svc_id = 0; + + hr_destroy_vol_struct(vol); + + return rc; +} + +static errno_t hr_fill_svcs_list_from_cfg(hr_config_t *cfg, list_t *list) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + for (size_t i = 0; i < cfg->dev_no; ++i) { + rc = hr_add_svc_linked_to_list(list, cfg->devs[i], false, + NULL); + if (rc != EOK) + goto error; + } + + return EOK; +error: + free_svc_id_list(list); + return rc; +} + +errno_t hr_util_try_assemble(hr_config_t *cfg, size_t *rassembled_cnt) +{ + HR_DEBUG("%s()", __func__); + + /* + * Scan partitions or disks: + * + * When we find a metadata block with a valid + * magic, take its UUID and try to find other + * extents with matching UUIDs. + * + * We ignore extents that are part of already + * active volumes. (even when the counter is lower + * on active volumes... XXX: use timestamp as initial counter value + * when assembling, or writing dirty metadata?)
+ */ + + size_t asm_cnt = 0; + errno_t rc; + list_t dev_id_list; + uint8_t vflags = 0; + + list_initialize(&dev_id_list); + + if (cfg == NULL) { + rc = hr_fill_disk_part_svcs_list(&dev_id_list); + } else { + rc = hr_fill_svcs_list_from_cfg(cfg, &dev_id_list); + vflags = cfg->vol_flags; + } + + if (rc != EOK) + goto error; + + rc = block_init_dev_list(&dev_id_list); + if (rc != EOK) + goto error; + + struct dev_list_member *iter; + while (!list_empty(&dev_id_list)) { + iter = list_pop(&dev_id_list, struct dev_list_member, link); + + void *metadata_struct_main; + hr_metadata_type_t type; + + rc = hr_find_metadata(iter->svc_id, &metadata_struct_main, &type); + if (rc == ENOFS) { + block_fini(iter->svc_id); + free_dev_list_member(iter); + rc = EOK; + continue; + } + + if (rc != EOK) { + block_fini(iter->svc_id); + free_dev_list_member(iter); + goto error; + } + + char *svc_name = NULL; + rc = loc_service_get_name(iter->svc_id, &svc_name); + if (rc != EOK) { + block_fini(iter->svc_id); + free_dev_list_member(iter); + goto error; + } + HR_DEBUG("found valid metadata on %s (type = %s), matching " + "other extents\n", + svc_name, hr_get_metadata_type_str(type)); + free(svc_name); + + list_t matching_svcs_list; + list_initialize(&matching_svcs_list); + + rc = hr_util_get_matching_md_svcs_list(&matching_svcs_list, + &dev_id_list, iter->svc_id, type, metadata_struct_main); + if (rc != EOK) { + block_fini(iter->svc_id); + free_dev_list_member(iter); + goto error; + } + + /* add current iter to list as well */ + rc = hr_add_svc_linked_to_list(&matching_svcs_list, + iter->svc_id, true, metadata_struct_main); + if (rc != EOK) { + block_fini(iter->svc_id); + free_svc_id_list(&matching_svcs_list); + goto error; + } + + free_dev_list_member(iter); + + /* remove matching list members from dev_id_list */ + list_foreach(matching_svcs_list, link, struct dev_list_member, + iter2) { + struct dev_list_member *to_remove; + list_foreach_safe(dev_id_list, cur_link, next_link) { + to_remove = list_get_instance(cur_link, + struct dev_list_member, link); + if (to_remove->svc_id == iter2->svc_id) { + list_remove(cur_link); + free_dev_list_member(to_remove); + } + } + } + + rc = hr_util_assemble_from_matching_list(&matching_svcs_list, + type, vflags); + switch (rc) { + case EOK: + asm_cnt++; + break; + case ENOMEM: + goto error; + default: + rc = EOK; + } + block_fini_dev_list(&matching_svcs_list); + free_svc_id_list(&matching_svcs_list); + } + +error: + if (rassembled_cnt != NULL) + *rassembled_cnt = asm_cnt; + + block_fini_dev_list(&dev_id_list); + free_svc_id_list(&dev_id_list); + + return rc; +} + +errno_t hr_util_add_hotspare(hr_volume_t *vol, service_id_t hotspare) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + + fibril_mutex_lock(&vol->hotspare_lock); + + if (vol->hotspare_no >= HR_MAX_HOTSPARES) { + HR_ERROR("%s(): cannot add more hotspares " + "to \"%s\"\n", __func__, vol->devname); + rc = ELIMIT; + goto error; + } + + for (size_t i = 0; i < vol->hotspare_no; i++) { + if (vol->hotspares[i].svc_id == hotspare) { + HR_ERROR("%s(): hotspare (%" PRIun ") already used in " + "%s\n", __func__, hotspare, vol->devname); + rc = EEXIST; + goto error; + } + } + + rc = block_init(hotspare); + if (rc != EOK) + goto error; + + uint64_t hs_blkno; + rc = block_get_nblocks(hotspare, &hs_blkno); + if (rc != EOK) { + block_fini(hotspare); + goto error; + } + + if (hs_blkno < vol->truncated_blkno) { + HR_ERROR("%s(): hotspare (%" PRIun ") doesn't have enough " + "blocks\n", __func__, hotspare); + + rc = EINVAL; + 
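/* undo the earlier block_init() before bailing out */ +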
block_fini(hotspare); + goto error; + } + + size_t hs_idx = vol->hotspare_no; + + vol->hotspare_no++; + + hr_update_hotspare_svc_id(vol, hs_idx, hotspare); + hr_update_hotspare_state(vol, hs_idx, HR_EXT_HOTSPARE); + + hr_mark_vol_state_dirty(vol); +error: + fibril_mutex_unlock(&vol->hotspare_lock); + return rc; +} + +void hr_raid5_xor(void *dst, const void *src, size_t size) +{ + size_t i; + uint64_t *d = dst; + const uint64_t *s = src; + + for (i = 0; i < size / sizeof(uint64_t); ++i) + *d++ ^= *s++; +} + +errno_t hr_sync_extents(hr_volume_t *vol) +{ + errno_t rc = EOK; + + fibril_rwlock_read_lock(&vol->extents_lock); + for (size_t e = 0; e < vol->extent_no; e++) { + fibril_rwlock_read_lock(&vol->states_lock); + hr_ext_state_t s = vol->extents[e].state; + fibril_rwlock_read_unlock(&vol->states_lock); + + service_id_t svc_id = vol->extents[e].svc_id; + + if (s == HR_EXT_ONLINE || s == HR_EXT_REBUILD) { + errno_t rc = hr_sync_cache(svc_id, 0, 0); + if (rc != EOK && rc != ENOTSUP) + vol->hr_ops.ext_state_cb(vol, e, rc); + } + } + fibril_rwlock_read_unlock(&vol->extents_lock); + + vol->hr_ops.vol_state_eval(vol); + + fibril_rwlock_read_lock(&vol->states_lock); + hr_vol_state_t s = vol->state; + fibril_rwlock_read_unlock(&vol->states_lock); + + if (s == HR_VOL_FAULTY) + rc = EIO; + + return rc; +} + +errno_t hr_init_rebuild(hr_volume_t *vol, size_t *rebuild_idx) +{ + HR_DEBUG("%s()", __func__); + + errno_t rc = EOK; + size_t bad = vol->extent_no; + + if (vol->level == HR_LVL_0) + return EINVAL; + + fibril_rwlock_read_lock(&vol->states_lock); + if (vol->state != HR_VOL_DEGRADED) { + fibril_rwlock_read_unlock(&vol->states_lock); + return EINVAL; + } + fibril_rwlock_read_unlock(&vol->states_lock); + + fibril_rwlock_write_lock(&vol->extents_lock); + fibril_rwlock_write_lock(&vol->states_lock); + fibril_mutex_lock(&vol->hotspare_lock); + + size_t rebuild = vol->extent_no; + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_REBUILD) { + rebuild = i; + break; + } + } + + if (rebuild < vol->extent_no) { + bad = rebuild; + goto init_rebuild; + } + + size_t invalid = vol->extent_no; + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state == HR_EXT_INVALID) { + invalid = i; + break; + } + } + + if (invalid < vol->extent_no) { + bad = invalid; + goto init_rebuild; + } + + for (size_t i = 0; i < vol->extent_no; i++) { + if (vol->extents[i].state != HR_EXT_ONLINE) { + bad = i; + break; + } + } + + if (bad == vol->extent_no || vol->hotspare_no == 0) { + rc = EINVAL; + goto error; + } + + size_t hotspare_idx = vol->hotspare_no - 1; + + hr_ext_state_t hs_state = vol->hotspares[hotspare_idx].state; + if (hs_state != HR_EXT_HOTSPARE) { + HR_ERROR("hr_init_rebuild(): invalid hotspare " + "state \"%s\", aborting rebuild\n", + hr_get_ext_state_str(hs_state)); + rc = EINVAL; + goto error; + } + + rc = hr_swap_hs(vol, bad, hotspare_idx); + if (rc != EOK) { + HR_ERROR("hr_init_rebuild(): swapping " + "hotspare failed, aborting rebuild\n"); + goto error; + } + + hr_extent_t *rebuild_ext = &vol->extents[bad]; + + HR_DEBUG("hr_init_rebuild(): starting REBUILD on extent no.
%zu " + "(%" PRIun ")\n", bad, rebuild_ext->svc_id); + +init_rebuild: + hr_update_ext_state(vol, bad, HR_EXT_REBUILD); + hr_update_vol_state(vol, HR_VOL_REBUILD); + + *rebuild_idx = bad; +error: + fibril_mutex_unlock(&vol->hotspare_lock); + fibril_rwlock_write_unlock(&vol->states_lock); + fibril_rwlock_write_unlock(&vol->extents_lock); + + return rc; +} + +static errno_t hr_swap_hs(hr_volume_t *vol, size_t bad, size_t hs) +{ + HR_DEBUG("%s()", __func__); + + service_id_t faulty_svc_id = vol->extents[bad].svc_id; + service_id_t hs_svc_id = vol->hotspares[hs].svc_id; + + hr_update_ext_svc_id(vol, bad, hs_svc_id); + hr_update_ext_state(vol, bad, HR_EXT_HOTSPARE); + + hr_update_hotspare_svc_id(vol, hs, 0); + hr_update_hotspare_state(vol, hs, HR_EXT_MISSING); + + vol->hotspare_no--; + + if (faulty_svc_id != 0) + block_fini(faulty_svc_id); + + return EOK; +} + +uint32_t hr_closest_pow2(uint32_t n) +{ + if (n == 0) + return 0; + + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return n - (n >> 1); +} + +/** @} + */ diff --git a/uspace/srv/bd/hr/util.h b/uspace/srv/bd/hr/util.h new file mode 100644 index 0000000000..45e7ac548e --- /dev/null +++ b/uspace/srv/bd/hr/util.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_UTIL_H +#define _HR_UTIL_H + +#include +#include +#include +#include + +#include "superblock.h" +#include "var.h" + +#define HR_DEBUG(format, ...) \ + log_msg(LOG_DEFAULT, LVL_DEBUG, format, ##__VA_ARGS__) + +#define HR_NOTE(format, ...) \ + log_msg(LOG_DEFAULT, LVL_NOTE, format, ##__VA_ARGS__) + +#define HR_WARN(format, ...) \ + log_msg(LOG_DEFAULT, LVL_WARN, format, ##__VA_ARGS__) + +#define HR_ERROR(format, ...) 
\ + log_msg(LOG_DEFAULT, LVL_ERROR, format, ##__VA_ARGS__) + +struct dev_list_member { + link_t link; + service_id_t svc_id; + void *md; + bool inited; + bool md_present; + bool fini; +}; + +typedef struct hr_range_lock { + link_t link; + fibril_mutex_t lock; + hr_volume_t *vol; /* back-pointer to volume */ + uint64_t off; /* start of the range */ + uint64_t len; /* length of the range */ + + size_t pending; /* prot. by vol->range_lock_list_lock */ + bool ignore; /* prot. by vol->range_lock_list_lock */ +} hr_range_lock_t; + +extern void *hr_malloc_waitok(size_t) + __attribute__((malloc)); + +extern void *hr_calloc_waitok(size_t, size_t) + __attribute__((malloc)); + +extern errno_t hr_create_vol_struct(hr_volume_t **, hr_level_t, const char *, + hr_metadata_type_t, uint8_t); +extern void hr_destroy_vol_struct(hr_volume_t *); +extern errno_t hr_get_volume_svcs(size_t *, service_id_t **); +extern hr_volume_t *hr_get_volume(service_id_t); +extern errno_t hr_remove_volume(service_id_t); +extern errno_t hr_init_extents_from_cfg(hr_volume_t *, hr_config_t *); +extern void hr_fini_devs(hr_volume_t *); +extern errno_t hr_register_volume(hr_volume_t *); +extern errno_t hr_check_ba_range(hr_volume_t *, size_t, uint64_t); +extern void hr_add_data_offset(hr_volume_t *, uint64_t *); +extern void hr_sub_data_offset(hr_volume_t *, uint64_t *); +extern void hr_update_ext_state(hr_volume_t *, size_t, hr_ext_state_t); +extern void hr_update_hotspare_state(hr_volume_t *, size_t, hr_ext_state_t); +extern void hr_update_vol_state(hr_volume_t *, hr_vol_state_t); +extern void hr_update_ext_svc_id(hr_volume_t *, size_t, service_id_t); +extern void hr_update_hotspare_svc_id(hr_volume_t *, size_t, service_id_t); +extern size_t hr_count_extents(hr_volume_t *, hr_ext_state_t); +extern void hr_mark_vol_state_dirty(hr_volume_t *); +extern hr_range_lock_t *hr_range_lock_acquire(hr_volume_t *, uint64_t, + uint64_t); +extern void hr_range_lock_release(hr_range_lock_t *); +extern errno_t hr_util_try_assemble(hr_config_t *, size_t *); +extern errno_t hr_util_add_hotspare(hr_volume_t *, service_id_t); +extern void hr_raid5_xor(void *, const void *, size_t); +extern errno_t hr_sync_extents(hr_volume_t *); +extern errno_t hr_init_rebuild(hr_volume_t *, size_t *); +extern uint32_t hr_closest_pow2(uint32_t); + +#endif + +/** @} + */ diff --git a/uspace/srv/bd/hr/var.h b/uspace/srv/bd/hr/var.h new file mode 100644 index 0000000000..48f4bec5aa --- /dev/null +++ b/uspace/srv/bd/hr/var.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2025 Miroslav Cimerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * - The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** @addtogroup hr + * @{ + */ +/** + * @file + */ + +#ifndef _HR_VAR_H +#define _HR_VAR_H + +#include +#include +#include +#include +#include +#include + +#include "fge.h" +#include "superblock.h" + +#define NAME "hr" +#define HR_STRIP_SIZE DATA_XFER_LIMIT + +#define HR_RAID1_READ_STRATEGY_SPLIT +#define HR_RAID1_READ_STRATEGY_SPLIT_THRESHOLD (1024 * 1) + +/* #define HR_RAID1_READ_STRATEGY_CLOSEST */ +/* #define HR_RAID1_READ_STRATEGY_ROUND_ROBIN */ +/* #define HR_RAID1_READ_STRATEGY_FIRST */ + +#if !defined(HR_RAID1_READ_STRATEGY_ROUND_ROBIN) && \ + !defined(HR_RAID1_READ_STRATEGY_CLOSEST) && \ + !defined(HR_RAID1_READ_STRATEGY_FIRST) && \ + (!defined(HR_RAID1_READ_STRATEGY_SPLIT) && \ + !defined(HR_RAID1_READ_STRATEGY_SPLIT_THRESHOLD)) +#error "A RAID 1 read strategy must be selected" +#endif + +/* + * During a rebuild operation, the rebuild position + * is saved every this many bytes (currently every + * 10 MiB). + */ +#define HR_REBUILD_SAVE_BYTES (10U * 1024 * 1024) + +struct hr_volume; +typedef struct hr_volume hr_volume_t; +typedef struct hr_stripe hr_stripe_t; +typedef struct hr_metadata hr_metadata_t; +typedef struct hr_superblock_ops hr_superblock_ops_t; + +typedef struct hr_ops { + errno_t (*create)(hr_volume_t *); + errno_t (*init)(hr_volume_t *); + void (*vol_state_eval)(hr_volume_t *); + void (*ext_state_cb)(hr_volume_t *, size_t, errno_t); +} hr_ops_t; + +typedef struct hr_volume { + link_t lvolumes; /* link to all volumes list */ + hr_ops_t hr_ops; /* level init and create fcns */ + bd_srvs_t hr_bds; /* block interface to the vol */ + service_id_t svc_id; /* service id */ + + list_t range_lock_list; /* list of range locks */ + fibril_mutex_t range_lock_list_lock; /* range locks list lock */ + + hr_fpool_t *fge; /* fibril pool */ + + void *in_mem_md; + fibril_mutex_t md_lock; /* lock protecting in_mem_md */ + + hr_superblock_ops_t *meta_ops; + + /* invariants */ + size_t extent_no; /* number of extents */ + size_t bsize; /* block size */ + uint64_t truncated_blkno; /* blkno per extent */ + uint64_t data_blkno; /* no. of user usable blocks */ + uint64_t data_offset; /* user data offset in blocks */ + uint32_t strip_size; /* strip size */ + hr_level_t level; /* volume level */ + hr_layout_t layout; /* RAID Level Qualifier */ + char devname[HR_DEVNAME_LEN]; + + hr_extent_t extents[HR_MAX_EXTENTS]; + fibril_rwlock_t extents_lock; /* extent service id lock */ + + size_t hotspare_no; /* no.
of available hotspares */ + hr_extent_t hotspares[HR_MAX_HOTSPARES]; + fibril_mutex_t hotspare_lock; /* lock protecting hotspares */ + + fibril_rwlock_t states_lock; /* states lock */ + + _Atomic bool state_dirty; /* dirty state */ + + /* + * used to increment metadata counter on first write, + * allowing non-destructive read-only access + */ + _Atomic bool first_write; + + _Atomic uint64_t last_ext_pos_arr[HR_MAX_EXTENTS]; + _Atomic uint64_t last_ext_used; + + _Atomic uint64_t rebuild_blk; /* rebuild position */ + _Atomic int open_cnt; /* open/close() counter */ + hr_vol_state_t state; /* volume state */ + uint8_t vflags; +} hr_volume_t; + +typedef enum { + HR_BD_READ, + HR_BD_WRITE +} hr_bd_op_type_t; + +/* macros for hr_metadata_save() */ +#define WITH_STATE_CALLBACK true +#define NO_STATE_CALLBACK false + +extern errno_t hr_raid0_create(hr_volume_t *); +extern errno_t hr_raid1_create(hr_volume_t *); +extern errno_t hr_raid5_create(hr_volume_t *); + +extern errno_t hr_raid0_init(hr_volume_t *); +extern errno_t hr_raid1_init(hr_volume_t *); +extern errno_t hr_raid5_init(hr_volume_t *); + +extern void hr_raid0_vol_state_eval(hr_volume_t *); +extern void hr_raid1_vol_state_eval(hr_volume_t *); +extern void hr_raid5_vol_state_eval(hr_volume_t *); + +extern void hr_raid0_ext_state_cb(hr_volume_t *, size_t, errno_t); +extern void hr_raid1_ext_state_cb(hr_volume_t *, size_t, errno_t); +extern void hr_raid5_ext_state_cb(hr_volume_t *, size_t, errno_t); +#endif + +/** @} + */ diff --git a/uspace/srv/locsrv/locsrv.c b/uspace/srv/locsrv/locsrv.c index 16cc0cbcc6..22a4d6ff6d 100644 --- a/uspace/srv/locsrv/locsrv.c +++ b/uspace/srv/locsrv/locsrv.c @@ -1329,6 +1329,9 @@ static bool loc_init(void) cat = category_new("disk"); categ_dir_add_cat(&cdir, cat); + cat = category_new("raid"); + categ_dir_add_cat(&cdir, cat); + cat = category_new("partition"); categ_dir_add_cat(&cdir, cat); diff --git a/uspace/srv/meson.build b/uspace/srv/meson.build index a8c034a2a3..af6de48d96 100644 --- a/uspace/srv/meson.build +++ b/uspace/srv/meson.build @@ -30,6 +30,7 @@ srvs = [ 'audio/hound', 'bd/file_bd', + 'bd/hr', 'bd/rd', 'bd/vbd', 'clipboard', diff --git a/uspace/srv/system/system.c b/uspace/srv/system/system.c index 5c51e31514..80e8e1bd50 100644 --- a/uspace/srv/system/system.c +++ b/uspace/srv/system/system.c @@ -476,6 +476,7 @@ static errno_t system_startup(void) srv_start("/srv/bd/vbd"); srv_start("/srv/volsrv"); + srv_start("/srv/bd/hr"); init_sysvol();
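
For reference, the RAID 5 recovery and parity paths above all reduce to the XOR identity implemented by hr_raid5_xor() in util.c: the parity strip is the byte-wise XOR of all data strips, so any single missing strip equals the XOR of the parity with the surviving strips. A minimal standalone sketch of that identity follows; the names and buffer sizes are illustrative only and not part of the patch. Buffers are declared as uint64_t arrays so the 8-byte strides match hr_raid5_xor(), which assumes sizes that are a multiple of 8 (always true for block-sized strips).

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Same operation as hr_raid5_xor() in util.c above. */
static void xor_buf(void *dst, const void *src, size_t size)
{
	uint64_t *d = dst;
	const uint64_t *s = src;

	for (size_t i = 0; i < size / sizeof(uint64_t); i++)
		*d++ ^= *s++;
}

int main(void)
{
	/* Two data strips and one parity strip of a hypothetical stripe. */
	uint64_t d0[64], d1[64], parity[64], recovered[64];

	memset(d0, 0xab, sizeof(d0));
	memset(d1, 0x5a, sizeof(d1));

	/* parity = d0 ^ d1 */
	memset(parity, 0, sizeof(parity));
	xor_buf(parity, d0, sizeof(parity));
	xor_buf(parity, d1, sizeof(parity));

	/* Pretend the extent holding d1 failed: recover it as parity ^ d0. */
	memcpy(recovered, parity, sizeof(recovered));
	xor_buf(recovered, d0, sizeof(recovered));

	assert(memcmp(recovered, d1, sizeof(d1)) == 0);
	return 0;
}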