diff --git a/CHANGELOG.md b/CHANGELOG.md index ca116453..446139d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,10 @@ ### Changed +- Support moving regions to other Pods during graceful shutdown of region servers ([#570]). - Default to OCI for image metadata and product image selection ([#611]). +[#570]: https://github.com/stackabletech/hbase-operator/pull/570 [#598]: https://github.com/stackabletech/hbase-operator/pull/598 [#605]: https://github.com/stackabletech/hbase-operator/pull/605 [#611]: https://github.com/stackabletech/hbase-operator/pull/611 diff --git a/Cargo.lock b/Cargo.lock index 1ec3ecfa..357f166a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "byteorder" @@ -396,9 +396,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -896,9 +896,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "httpdate" @@ -908,9 +908,9 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "hyper" -version = "1.5.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -2020,9 +2020,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" [[package]] name = "rustls-webpki" @@ -2043,9 +2043,9 @@ checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "schannel" @@ -2181,9 +2181,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.137" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ "itoa", "memchr", @@ -2235,6 +2235,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shell-escape" +version = "0.1.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "45bb67a18fa91266cc7807181f62f9178a6873bfad7dc788c42e6430db40184f" + [[package]] name = "shlex" version = "1.3.0" @@ -2339,6 +2345,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", + "shell-escape", "snafu 0.8.5", "stackable-operator", "strum", @@ -2355,7 +2362,6 @@ dependencies = [ "const_format", "fnv", "futures 0.3.31", - "indoc", "product-config", "rstest", "serde", @@ -2868,9 +2874,9 @@ checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" [[package]] name = "unicode-ident" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-xid" @@ -3146,9 +3152,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "ad699df48212c6cc6eb4435f35500ac6fd3b9913324f938aea302022ce19d310" dependencies = [ "memchr", ] diff --git a/Cargo.nix b/Cargo.nix index e155ff58..f3845f29 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -6963,6 +6963,17 @@ rec { "loom" = [ "dep:loom" ]; }; }; + "shell-escape" = rec { + crateName = "shell-escape"; + version = "0.1.5"; + edition = "2015"; + sha256 = "0kqq83dk0r1fqj4cfzddpxrni2hpz5i1y607g366c4m9iyhngfs5"; + libName = "shell_escape"; + authors = [ + "Steven Fackler " + ]; + + }; "shlex" = rec { crateName = "shlex"; version = "1.3.0"; @@ -7250,6 +7261,10 @@ rec { name = "serde_json"; packageId = "serde_json"; } + { + name = "shell-escape"; + packageId = "shell-escape"; + } { name = "snafu"; packageId = "snafu 0.8.5"; @@ -7321,10 +7336,6 @@ rec { packageId = "futures 0.3.31"; features = [ "compat" ]; } - { - name = "indoc"; - packageId = "indoc"; - } { name = "product-config"; packageId = "product-config"; @@ -7369,6 +7380,10 @@ rec { } ]; devDependencies = [ + { + name = "indoc"; + packageId = "indoc"; + } { name = "rstest"; packageId = "rstest"; diff --git a/Cargo.toml b/Cargo.toml index c90d093e..50c4db7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ rstest = "0.24" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" serde_yaml = "0.9" +shell-escape = "0.1" snafu = "0.8" stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag = "stackable-operator-0.85.0" } product-config = { git = "https://github.com/stackabletech/product-config.git", tag = "0.7.0" } diff --git a/deploy/helm/hbase-operator/crds/crds.yaml b/deploy/helm/hbase-operator/crds/crds.yaml index 73445bbb..14b43e18 100644 --- a/deploy/helm/hbase-operator/crds/crds.yaml +++ b/deploy/helm/hbase-operator/crds/crds.yaml @@ -688,6 +688,9 @@ spec: description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. nullable: true type: string + hbaseOpts: + nullable: true + type: string hbaseRootdir: nullable: true type: string @@ -775,6 +778,34 @@ spec: nullable: true type: boolean type: object + regionMover: + default: + ack: null + maxThreads: null + runBeforeShutdown: null + description: Before terminating a region server pod, the RegionMover tool can be invoked to transfer local regions to other servers. 
This may cause a lot of network traffic in the Kubernetes cluster if the entire HBase stacklet is being restarted. The operator will compute a timeout period for the region move that will not exceed the graceful shutdown timeout. + properties: + ack: + description: If enabled (default), the region mover will confirm that regions are available on the source as well as the target pods before and after the move. + nullable: true + type: boolean + additionalMoverOptions: + default: [] + description: Additional options to pass to the region mover. + items: + type: string + type: array + maxThreads: + description: Maximum number of threads to use for moving regions. + format: uint16 + minimum: 0.0 + nullable: true + type: integer + runBeforeShutdown: + description: Move local regions to other servers before terminating a region server's pod. + nullable: true + type: boolean + type: object requestedSecretLifetime: description: Request secret (currently only autoTls certificates) lifetime from the secret operator, e.g. `7d`, or `30d`. Please note that this can be shortened by the `maxCertificateLifetime` setting on the SecretClass issuing the TLS certificate. nullable: true @@ -938,6 +969,9 @@ spec: description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. nullable: true type: string + hbaseOpts: + nullable: true + type: string hbaseRootdir: nullable: true type: string @@ -1025,6 +1059,34 @@ spec: nullable: true type: boolean type: object + regionMover: + default: + ack: null + maxThreads: null + runBeforeShutdown: null + description: Before terminating a region server pod, the RegionMover tool can be invoked to transfer local regions to other servers. This may cause a lot of network traffic in the Kubernetes cluster if the entire HBase stacklet is being restarted. The operator will compute a timeout period for the region move that will not exceed the graceful shutdown timeout. + properties: + ack: + description: If enabled (default), the region mover will confirm that regions are available on the source as well as the target pods before and after the move. + nullable: true + type: boolean + additionalMoverOptions: + default: [] + description: Additional options to pass to the region mover. + items: + type: string + type: array + maxThreads: + description: Maximum number of threads to use for moving regions. + format: uint16 + minimum: 0.0 + nullable: true + type: integer + runBeforeShutdown: + description: Move local regions to other servers before terminating a region server's pod. + nullable: true + type: boolean + type: object requestedSecretLifetime: description: Request secret (currently only autoTls certificates) lifetime from the secret operator, e.g. `7d`, or `30d`. Please note that this can be shortened by the `maxCertificateLifetime` setting on the SecretClass issuing the TLS certificate. nullable: true diff --git a/docs/modules/hbase/pages/usage-guide/operations/graceful-shutdown.adoc b/docs/modules/hbase/pages/usage-guide/operations/graceful-shutdown.adoc index 3eb85373..f58191f2 100644 --- a/docs/modules/hbase/pages/usage-guide/operations/graceful-shutdown.adoc +++ b/docs/modules/hbase/pages/usage-guide/operations/graceful-shutdown.adoc @@ -1,6 +1,6 @@ = Graceful shutdown -You can configure the graceful shutdown as described in xref:concepts:operations/graceful_shutdown.adoc[]. +You can configure the graceful shutdown grace period as described in xref:concepts:operations/graceful_shutdown.adoc[]. 
== Masters @@ -15,7 +15,7 @@ However, there is no message in the log acknowledging the graceful shutdown. == RegionServers -As a default, RegionServers have `60 minutes` to shut down gracefully. +By default, RegionServers have `60 minutes` to shut down gracefully. They use the same mechanism described above. In contrast to the Master servers, they will, however, acknowledge the graceful shutdown with a message in the logs: @@ -26,6 +26,61 @@ In contrast to the Master servers, they will, however, acknowledge the graceful 2023-10-11 12:38:05,060 INFO [shutdown-hook-0] regionserver.HRegionServer: ***** STOPPING region server 'test-hbase-regionserver-default-0.test-hbase-regionserver-default.kuttl-test-topical-parakeet.svc.cluster.local,16020,1697027870348' ***** ---- +The operator allows for finer control over the shutdown process of region servers. +For each region server pod, the region mover tool may be invoked before terminating the region server's pod. +The affected regions are transferred to other pods thus ensuring that the data is still available. + +Here is an example: + +[source,yaml] +---- +spec: + regionServers: + config: + regionMover: + runBeforeShutdown: true # <1> + maxThreads: 5 # <2> + ack: false # <3> + additionalMoverOptions: ["--designatedFile", "/path/to/designatedFile"] # <4> +---- +<1>: Run the region mover tool before shutting down the region server. Default is `false`. +<2>: Maximum number of threads to use for moving regions. Default is 1. +<3>: Enable or disable region confirmation on the present and target servers. Default is `true`. +<4>: Extra options to pass to the region mover tool. + +For a list of additional options accepted by the region mover use the `--help` option first: + +[source] +---- +$ /stackable/hbase/bin/hbase org.apache.hadoop.hbase.util.RegionMover --help +usage: hbase org.apache.hadoop.hbase.util.RegionMover +Options: + -r,--regionserverhost region server | + -o,--operation Expected: load/unload/unload_from_rack/isolate_regions + -m,--maxthreads Define the maximum number of threads to use to unload and reload the regions + -i,--isolateRegionIds Comma separated list of Region IDs hash to isolate on a RegionServer and put region + server in draining mode. This option should only be used with '-o isolate_regions'. By + putting region server in decommission/draining mode, master can't assign any new region + on this server. If one or more regions are not found OR failed to isolate successfully, + utility will exist without putting RS in draining/decommission mode. Ex. + --isolateRegionIds id1,id2,id3 OR -i id1,id2,id3 + -x,--excludefile File with per line to exclude as unload targets; default excludes only + target host; useful for rack decommisioning. + -d,--designatedfile File with per line as unload targets;default is all online hosts + -f,--filename File to save regions list into unloading, or read from loading; default + /tmp/ + -n,--noack Turn on No-Ack mode(default: false) which won't check if region is online on target + RegionServer, hence best effort. This is more performant in unloading and loading but + might lead to region being unavailable for some time till master reassigns it in case the + move failed + -t,--timeout timeout in seconds after which the tool will exit irrespective of whether it finished or + not;default Integer.MAX_VALUE +---- + +NOTE: There is no need to explicitly specify a timeout for the region movement. 
The operator will compute an appropriate timeout that cannot exceed the `gracefulShutdownTimeout` for region servers. + +IMPORTANT: The ZooKeeper connection must be available during the time the region mover is running for the graceful shutdown process to succeed. + == RestServers As a default, RestServers have `5 minutes` to shut down gracefully. diff --git a/rust/crd/Cargo.toml b/rust/crd/Cargo.toml index e59e4464..8fec0dba 100644 --- a/rust/crd/Cargo.toml +++ b/rust/crd/Cargo.toml @@ -12,6 +12,7 @@ publish = false product-config.workspace = true serde.workspace = true serde_json.workspace = true +shell-escape.workspace = true snafu.workspace = true stackable-operator.workspace = true strum.workspace = true diff --git a/rust/crd/src/affinity.rs b/rust/crd/src/affinity.rs index aa2f8b59..9edc1548 100644 --- a/rust/crd/src/affinity.rs +++ b/rust/crd/src/affinity.rs @@ -123,13 +123,15 @@ mod tests { replicas: 1 "#; let hbase: HbaseCluster = serde_yaml::from_str(input).expect("illegal test input"); - let merged_config = hbase + let affinity = hbase .merged_config( &role, "default", &hbase.spec.cluster_config.hdfs_config_map_name, ) - .unwrap(); + .unwrap() + .affinity() + .clone(); let mut expected_affinities = vec![WeightedPodAffinityTerm { pod_affinity_term: PodAffinityTerm { @@ -184,7 +186,7 @@ mod tests { }; assert_eq!( - merged_config.affinity, + affinity, StackableAffinity { pod_affinity: Some(PodAffinity { preferred_during_scheduling_ignored_during_execution: Some(expected_affinities), diff --git a/rust/crd/src/lib.rs b/rust/crd/src/lib.rs index 7c90532f..e69f45b7 100644 --- a/rust/crd/src/lib.rs +++ b/rust/crd/src/lib.rs @@ -1,7 +1,9 @@ -use std::{collections::BTreeMap, str::FromStr}; +use std::collections::{BTreeMap, HashMap}; +use product_config::types::PropertyNameKind; use security::AuthenticationConfig; use serde::{Deserialize, Serialize}; +use shell_escape::escape; use snafu::{OptionExt, ResultExt, Snafu}; use stackable_operator::{ commons::{ @@ -15,13 +17,17 @@ use stackable_operator::{ }, config::{ fragment::{self, Fragment, ValidationError}, - merge::Merge, + merge::{Atomic, Merge}, + }, + k8s_openapi::{ + api::core::v1::{EnvVar, PodTemplateSpec}, + apimachinery::pkg::api::resource::Quantity, + DeepMerge, }, - k8s_openapi::{api::core::v1::EnvVar, apimachinery::pkg::api::resource::Quantity}, kube::{runtime::reflector::ObjectRef, CustomResource, ResourceExt}, product_config_utils::Configuration, product_logging::{self, spec::Logging}, - role_utils::{GenericRoleConfig, JavaCommonConfig, Role, RoleGroup, RoleGroupRef}, + role_utils::{GenericRoleConfig, JavaCommonConfig, Role, RoleGroupRef}, schemars::{self, JsonSchema}, status::condition::{ClusterCondition, HasStatusCondition}, time::Duration, @@ -35,6 +41,8 @@ pub mod security; pub const APP_NAME: &str = "hbase"; +// This constant is hard coded in hbase-entrypoint.sh +// You need to change it there too. 
pub const CONFIG_DIR_NAME: &str = "/stackable/conf"; pub const TLS_STORE_DIR: &str = "/stackable/tls"; @@ -72,6 +80,9 @@ pub const HBASE_REST_UI_PORT: u16 = 8085; // Newer versions use the same port as the UI because Hbase provides it's own metrics API pub const METRICS_PORT: u16 = 9100; +const DEFAULT_REGION_MOVER_TIMEOUT: Duration = Duration::from_minutes_unchecked(59); +const DEFAULT_REGION_MOVER_DELTA_TO_SHUTDOWN: Duration = Duration::from_minutes_unchecked(1); + #[derive(Snafu, Debug)] pub enum Error { #[snafu(display("the role [{role}] is invalid and does not exist in HBase"))] @@ -83,11 +94,17 @@ pub enum Error { #[snafu(display("the HBase role [{role}] is missing from spec"))] MissingHbaseRole { role: String }, - #[snafu(display("the HBase role group [{role_group}] is missing from spec"))] - MissingHbaseRoleGroup { role_group: String }, - #[snafu(display("fragment validation failure"))] FragmentValidationFailure { source: ValidationError }, + + #[snafu(display("object defines no master role"))] + NoMasterRole, + + #[snafu(display("object defines no regionserver role"))] + NoRegionServerRole, + + #[snafu(display("incompatible merge types"))] + IncompatibleMergeTypes, } /// An HBase cluster stacklet. This resource is managed by the Stackable operator for Apache HBase. @@ -130,7 +147,8 @@ pub struct HbaseClusterSpec { /// Region servers hold the data and handle requests from clients for their region. #[serde(default, skip_serializing_if = "Option::is_none")] - pub region_servers: Option>, + pub region_servers: + Option>, /// Rest servers provide a REST API to interact with. #[serde(default, skip_serializing_if = "Option::is_none")] @@ -331,19 +349,118 @@ impl HbaseRole { HbaseRole::RestServer => "rest".to_string(), } } +} - /// We could have different service names depended on the role (e.g. "hbase-master", "hbase-regionserver" and - /// "hbase-restserver"). However this produces error messages such as - /// [RpcServer.priority.RWQ.Fifo.write.handler=0,queue=0,port=16020] security.ShellBasedUnixGroupsMapping: unable to return groups for user hbase-master PartialGroupNameException The user name 'hbase-master' is not found. id: 'hbase-master': no such user - /// or - /// Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.apache.hadoop.hbase.security.AccessDeniedException): org.apache.hadoop.hbase.security.AccessDeniedException: Insufficient permissions (user=hbase-master/hbase-master-default-1.hbase-master-default.kuttl-test-poetic-sunbeam.svc.cluster.local@CLUSTER.LOCAL, scope=hbase:meta, family=table:state, params=[table=hbase:meta,family=table:state],action=WRITE) - /// - /// Also the documentation states: - /// > A Kerberos principal has three parts, with the form username/fully.qualified.domain.name@YOUR-REALM.COM. We recommend using hbase as the username portion. - /// - /// As a result we use "hbase" everywhere (which e.g. 
differs from the current hdfs implementation) - pub fn kerberos_service_name(&self) -> &'static str { - "hbase" +fn default_resources(role: &HbaseRole) -> ResourcesFragment { + match role { + HbaseRole::RegionServer => ResourcesFragment { + cpu: CpuLimitsFragment { + min: Some(Quantity("250m".to_owned())), + max: Some(Quantity("1".to_owned())), + }, + memory: MemoryLimitsFragment { + limit: Some(Quantity("1Gi".to_owned())), + runtime_limits: NoRuntimeLimitsFragment {}, + }, + storage: HbaseStorageConfigFragment {}, + }, + HbaseRole::RestServer => ResourcesFragment { + cpu: CpuLimitsFragment { + min: Some(Quantity("100m".to_owned())), + max: Some(Quantity("400m".to_owned())), + }, + memory: MemoryLimitsFragment { + limit: Some(Quantity("512Mi".to_owned())), + runtime_limits: NoRuntimeLimitsFragment {}, + }, + storage: HbaseStorageConfigFragment {}, + }, + HbaseRole::Master => ResourcesFragment { + cpu: CpuLimitsFragment { + min: Some(Quantity("250m".to_owned())), + max: Some(Quantity("1".to_owned())), + }, + memory: MemoryLimitsFragment { + limit: Some(Quantity("1Gi".to_owned())), + runtime_limits: NoRuntimeLimitsFragment {}, + }, + storage: HbaseStorageConfigFragment {}, + }, + } +} + +#[derive(Debug, Clone)] +enum AnyConfigFragment { + RegionServer(RegionServerConfigFragment), + RestServer(HbaseConfigFragment), + Master(HbaseConfigFragment), +} + +impl AnyConfigFragment { + fn merge(self, other: &AnyConfigFragment) -> Result { + match (self, other) { + (AnyConfigFragment::RegionServer(mut me), AnyConfigFragment::RegionServer(you)) => { + me.merge(you); + Ok(AnyConfigFragment::RegionServer(me.clone())) + } + (AnyConfigFragment::RestServer(mut me), AnyConfigFragment::RestServer(you)) => { + me.merge(you); + Ok(AnyConfigFragment::RestServer(me.clone())) + } + (AnyConfigFragment::Master(mut me), AnyConfigFragment::Master(you)) => { + me.merge(you); + Ok(AnyConfigFragment::Master(me.clone())) + } + (_, _) => Err(Error::IncompatibleMergeTypes), + } + } + + fn default_for( + role: &HbaseRole, + cluster_name: &str, + hdfs_discovery_cm_name: &str, + ) -> AnyConfigFragment { + match role { + HbaseRole::RegionServer => { + AnyConfigFragment::RegionServer(RegionServerConfigFragment { + hbase_rootdir: None, + hbase_opts: None, + resources: default_resources(role), + logging: product_logging::spec::default_logging(), + affinity: get_affinity(cluster_name, role, hdfs_discovery_cm_name), + graceful_shutdown_timeout: Some( + HbaseRole::DEFAULT_REGION_SERVER_GRACEFUL_SHUTDOWN_TIMEOUT, + ), + region_mover: RegionMoverFragment { + run_before_shutdown: Some(false), + max_threads: Some(1), + ack: Some(true), + cli_opts: None, + }, + requested_secret_lifetime: Some(HbaseRole::DEFAULT_REGION_SECRET_LIFETIME), + }) + } + HbaseRole::RestServer => AnyConfigFragment::RestServer(HbaseConfigFragment { + hbase_rootdir: None, + resources: default_resources(role), + logging: product_logging::spec::default_logging(), + affinity: get_affinity(cluster_name, role, hdfs_discovery_cm_name), + graceful_shutdown_timeout: Some( + HbaseRole::DEFAULT_REST_SERVER_GRACEFUL_SHUTDOWN_TIMEOUT, + ), + requested_secret_lifetime: Some(HbaseRole::DEFAULT_REST_SECRET_LIFETIME), + }), + HbaseRole::Master => AnyConfigFragment::Master(HbaseConfigFragment { + hbase_rootdir: None, + resources: default_resources(role), + logging: product_logging::spec::default_logging(), + affinity: get_affinity(cluster_name, role, hdfs_discovery_cm_name), + graceful_shutdown_timeout: Some( + HbaseRole::DEFAULT_MASTER_GRACEFUL_SHUTDOWN_TIMEOUT, + ), + 
requested_secret_lifetime: Some(HbaseRole::DEFAULT_MASTER_SECRET_LIFETIME), + }), + } } } @@ -492,6 +609,159 @@ impl Configuration for HbaseConfigFragment { } } +#[derive(Fragment, Clone, Debug, JsonSchema, PartialEq, Serialize, Deserialize)] +#[fragment_attrs( + derive( + Clone, + Debug, + Default, + Deserialize, + Merge, + JsonSchema, + PartialEq, + Serialize + ), + serde(rename_all = "camelCase") +)] +pub struct RegionMover { + /// Move local regions to other servers before terminating a region server's pod. + run_before_shutdown: bool, + + /// Maximum number of threads to use for moving regions. + max_threads: u16, + + /// If enabled (default), the region mover will confirm that regions are available on the + /// source as well as the target pods before and after the move. + ack: bool, + + #[fragment_attrs(serde(flatten))] + cli_opts: Option, +} + +#[derive(Clone, Debug, Eq, Deserialize, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "camelCase")] +#[schemars(deny_unknown_fields)] +pub struct RegionMoverExtraCliOpts { + /// Additional options to pass to the region mover. + #[serde(default)] + pub additional_mover_options: Vec, +} + +impl Atomic for RegionMoverExtraCliOpts {} + +#[derive(Clone, Debug, Fragment, JsonSchema, PartialEq)] +#[fragment_attrs( + derive( + Clone, + Debug, + Default, + Deserialize, + Merge, + JsonSchema, + PartialEq, + Serialize + ), + serde(rename_all = "camelCase") +)] +pub struct RegionServerConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub hbase_rootdir: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub hbase_opts: Option, + #[fragment_attrs(serde(default))] + pub resources: Resources, + #[fragment_attrs(serde(default))] + pub logging: Logging, + #[fragment_attrs(serde(default))] + pub affinity: StackableAffinity, + + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + #[fragment_attrs(serde(default))] + pub graceful_shutdown_timeout: Option, + + /// Request secret (currently only autoTls certificates) lifetime from the secret operator, e.g. `7d`, or `30d`. + /// Please note that this can be shortened by the `maxCertificateLifetime` setting on the SecretClass issuing the TLS certificate. + #[fragment_attrs(serde(default))] + pub requested_secret_lifetime: Option, + + /// Before terminating a region server pod, the RegionMover tool can be invoked to transfer + /// local regions to other servers. + /// This may cause a lot of network traffic in the Kubernetes cluster if the entire HBase stacklet is being + /// restarted. + /// The operator will compute a timeout period for the region move that will not exceed the graceful shutdown timeout. 
+ #[fragment_attrs(serde(default))] + pub region_mover: RegionMover, +} + +impl Configuration for RegionServerConfigFragment { + type Configurable = HbaseCluster; + + fn compute_env( + &self, + _resource: &Self::Configurable, + _role_name: &str, + ) -> Result>, stackable_operator::product_config_utils::Error> + { + let mut vars: BTreeMap> = BTreeMap::new(); + + vars.insert( + "HBASE_CONF_DIR".to_string(), + Some(CONFIG_DIR_NAME.to_string()), + ); + // required by phoenix (for cases where Kerberos is enabled): see https://issues.apache.org/jira/browse/PHOENIX-2369 + vars.insert( + "HADOOP_CONF_DIR".to_string(), + Some(CONFIG_DIR_NAME.to_string()), + ); + Ok(vars) + } + + fn compute_cli( + &self, + _resource: &Self::Configurable, + _role_name: &str, + ) -> Result>, stackable_operator::product_config_utils::Error> + { + Ok(BTreeMap::new()) + } + + fn compute_files( + &self, + _resource: &Self::Configurable, + _role_name: &str, + file: &str, + ) -> Result>, stackable_operator::product_config_utils::Error> + { + let mut result = BTreeMap::new(); + + match file { + HBASE_ENV_SH => { + // The contents of this file cannot be built entirely here because we don't have + // access to the clusterConfig or product version. + // These are needed to set up Kerberos and JMX exporter settings. + // To avoid fragmentation of the code needed to build this file, we moved the + // implementation to the hbase_controller::build_hbase_env_sh() function. + } + HBASE_SITE_XML => { + result.insert( + HBASE_CLUSTER_DISTRIBUTED.to_string(), + Some("true".to_string()), + ); + result.insert( + HBASE_UNSAFE_REGIONSERVER_HOSTNAME_DISABLE_MASTER_REVERSEDNS.to_string(), + Some("true".to_string()), + ); + result.insert(HBASE_ROOTDIR.to_string(), self.hbase_rootdir.clone()); + } + _ => {} + } + + result.retain(|_, maybe_value| maybe_value.is_some()); + + Ok(result) + } +} + #[derive(Clone, Debug, Default, Deserialize, Eq, JsonSchema, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] pub struct HbaseClusterStatus { @@ -509,6 +779,247 @@ impl HasStatusCondition for HbaseCluster { } impl HbaseCluster { + /// Retrieve and merge resource configs for role and role groups + pub fn merged_config( + &self, + role: &HbaseRole, + role_group: &str, + hdfs_discovery_cm_name: &str, + ) -> Result { + // Initialize the result with all default values as baseline + let defaults = + AnyConfigFragment::default_for(role, &self.name_any(), hdfs_discovery_cm_name); + + let (mut role_config, mut role_group_config) = match role { + HbaseRole::RegionServer => { + let role = self + .spec + .region_servers + .clone() + .context(MissingHbaseRoleSnafu { + role: role.to_string(), + })?; + + let role_config = role.config.config.to_owned(); + let role_group_config = role + .role_groups + .get(role_group) + .map(|rg| rg.config.config.clone()) + .unwrap_or_default(); + + ( + AnyConfigFragment::RegionServer(role_config), + AnyConfigFragment::RegionServer(role_group_config), + ) + } + HbaseRole::RestServer => { + let role = self + .spec + .rest_servers + .clone() + .context(MissingHbaseRoleSnafu { + role: role.to_string(), + })?; + + let role_config = role.config.config.to_owned(); + + let role_group_config = role + .role_groups + .get(role_group) + .map(|rg| rg.config.config.clone()) + .unwrap_or_default(); + + // Retrieve role resource config + ( + AnyConfigFragment::RestServer(role_config), + AnyConfigFragment::RestServer(role_group_config), + ) + } + HbaseRole::Master => { + let role = self.spec.masters.clone().context(MissingHbaseRoleSnafu { + role: 
role.to_string(), + })?; + + let role_config = role.config.config.to_owned(); + + // Retrieve rolegroup specific resource config + let role_group_config = role + .role_groups + .get(role_group) + .map(|rg| rg.config.config.clone()) + .unwrap_or_default(); + + // Retrieve role resource config + ( + AnyConfigFragment::Master(role_config), + AnyConfigFragment::Master(role_group_config), + ) + } + }; + + // Merge more specific configs into default config + // Hierarchy is: + // 1. RoleGroup + // 2. Role + // 3. Default + role_config = role_config.merge(&defaults)?; + role_group_config = role_group_config.merge(&role_config)?; + + tracing::debug!("Merged config: {:?}", role_group_config); + + Ok(match role_group_config { + AnyConfigFragment::RegionServer(conf) => AnyServiceConfig::RegionServer( + fragment::validate(conf).context(FragmentValidationFailureSnafu)?, + ), + AnyConfigFragment::RestServer(conf) => AnyServiceConfig::RestServer( + fragment::validate(conf).context(FragmentValidationFailureSnafu)?, + ), + AnyConfigFragment::Master(conf) => AnyServiceConfig::Master( + fragment::validate(conf).context(FragmentValidationFailureSnafu)?, + ), + }) + } + + // The result type is only defined once, there is no value in extracting it into a type definition. + #[allow(clippy::type_complexity)] + pub fn build_role_properties( + &self, + ) -> Result< + HashMap< + String, + ( + Vec, + Role< + impl Configuration, + GenericRoleConfig, + JavaCommonConfig, + >, + ), + >, + Error, + > { + let config_types = vec![ + PropertyNameKind::Env, + PropertyNameKind::File(HBASE_ENV_SH.to_string()), + PropertyNameKind::File(HBASE_SITE_XML.to_string()), + PropertyNameKind::File(SSL_SERVER_XML.to_string()), + PropertyNameKind::File(SSL_CLIENT_XML.to_string()), + PropertyNameKind::File(JVM_SECURITY_PROPERTIES_FILE.to_string()), + ]; + + let mut roles = HashMap::from([( + HbaseRole::Master.to_string(), + ( + config_types.to_owned(), + self.spec + .masters + .clone() + .context(NoMasterRoleSnafu)? + .erase(), + ), + )]); + roles.insert( + HbaseRole::RegionServer.to_string(), + ( + config_types.to_owned(), + self.spec + .region_servers + .clone() + .context(NoRegionServerRoleSnafu)? 
+ .erase(), + ), + ); + + if let Some(rest_servers) = self.spec.rest_servers.as_ref() { + roles.insert( + HbaseRole::RestServer.to_string(), + (config_types, rest_servers.to_owned().erase()), + ); + } + + Ok(roles) + } + + pub fn merge_pod_overrides( + &self, + pod_template: &mut PodTemplateSpec, + role: &HbaseRole, + role_group_ref: &RoleGroupRef, + ) { + let (role_pod_overrides, role_group_pod_overrides) = match role { + HbaseRole::Master => ( + self.spec + .masters + .as_ref() + .map(|r| r.config.pod_overrides.clone()), + self.spec + .masters + .as_ref() + .and_then(|r| r.role_groups.get(&role_group_ref.role_group)) + .map(|r| r.config.pod_overrides.clone()), + ), + HbaseRole::RegionServer => ( + self.spec + .region_servers + .as_ref() + .map(|r| r.config.pod_overrides.clone()), + self.spec + .region_servers + .as_ref() + .and_then(|r| r.role_groups.get(&role_group_ref.role_group)) + .map(|r| r.config.pod_overrides.clone()), + ), + HbaseRole::RestServer => ( + self.spec + .rest_servers + .as_ref() + .map(|r| r.config.pod_overrides.clone()), + self.spec + .rest_servers + .as_ref() + .and_then(|r| r.role_groups.get(&role_group_ref.role_group)) + .map(|r| r.config.pod_overrides.clone()), + ), + }; + + if let Some(rpo) = role_pod_overrides { + pod_template.merge_from(rpo); + } + if let Some(rgpo) = role_group_pod_overrides { + pod_template.merge_from(rgpo); + } + } + + pub fn replicas( + &self, + hbase_role: &HbaseRole, + role_group_ref: &RoleGroupRef, + ) -> Option { + match hbase_role { + HbaseRole::Master => self + .spec + .masters + .as_ref() + .and_then(|r| r.role_groups.get(&role_group_ref.role_group)) + .and_then(|rg| rg.replicas) + .map(i32::from), + HbaseRole::RegionServer => self + .spec + .region_servers + .as_ref() + .and_then(|r| r.role_groups.get(&role_group_ref.role_group)) + .and_then(|rg| rg.replicas) + .map(i32::from), + HbaseRole::RestServer => self + .spec + .rest_servers + .as_ref() + .and_then(|r| r.role_groups.get(&role_group_ref.role_group)) + .and_then(|rg| rg.replicas) + .map(i32::from), + } + } + /// The name of the role-level load-balanced Kubernetes `Service` pub fn server_role_service_name(&self) -> Option { self.metadata.name.clone() @@ -527,38 +1038,6 @@ impl HbaseCluster { } } - pub fn get_role( - &self, - role: &HbaseRole, - ) -> Option<&Role> { - match role { - HbaseRole::Master => self.spec.masters.as_ref(), - HbaseRole::RegionServer => self.spec.region_servers.as_ref(), - HbaseRole::RestServer => self.spec.rest_servers.as_ref(), - } - } - - /// Get the RoleGroup struct for the given ref - pub fn get_role_group( - &self, - rolegroup_ref: &RoleGroupRef, - ) -> Result<&RoleGroup, Error> { - let role_variant = - HbaseRole::from_str(&rolegroup_ref.role).with_context(|_| InvalidRoleSnafu { - role: rolegroup_ref.role.to_owned(), - })?; - let role = self - .get_role(&role_variant) - .with_context(|| MissingHbaseRoleSnafu { - role: role_variant.to_string(), - })?; - role.role_groups - .get(&rolegroup_ref.role_group) - .with_context(|| MissingHbaseRoleGroupSnafu { - role_group: rolegroup_ref.role_group.to_owned(), - }) - } - pub fn role_config(&self, role: &HbaseRole) -> Option<&GenericRoleConfig> { match role { HbaseRole::Master => self.spec.masters.as_ref().map(|m| &m.role_config), @@ -629,6 +1108,14 @@ impl HbaseCluster { } } + pub fn service_port(&self, role: &HbaseRole) -> u16 { + match role { + HbaseRole::Master => HBASE_MASTER_PORT, + HbaseRole::RegionServer => HBASE_REGIONSERVER_PORT, + HbaseRole::RestServer => HBASE_REST_PORT, + } + } + /// Name of the 
port used by the Web UI, which depends on HTTPS usage fn ui_port_name(&self) -> String { if self.has_https_enabled() { @@ -638,77 +1125,135 @@ impl HbaseCluster { } .to_string() } - - /// Retrieve and merge resource configs for role and role groups - pub fn merged_config( - &self, - role: &HbaseRole, - role_group: &str, - hdfs_discovery_cm_name: &str, - ) -> Result { - // Initialize the result with all default values as baseline - let conf_defaults = role.default_config(&self.name_any(), hdfs_discovery_cm_name); - - let role = self.get_role(role).context(MissingHbaseRoleSnafu { - role: role.to_string(), - })?; - - // Retrieve role resource config - let mut conf_role = role.config.config.to_owned(); - - // Retrieve rolegroup specific resource config - let mut conf_rolegroup = role - .role_groups - .get(role_group) - .map(|rg| rg.config.config.clone()) - .unwrap_or_default(); - - // Merge more specific configs into default config - // Hierarchy is: - // 1. RoleGroup - // 2. Role - // 3. Default - conf_role.merge(&conf_defaults); - conf_rolegroup.merge(&conf_role); - - tracing::debug!("Merged config: {:?}", conf_rolegroup); - fragment::validate(conf_rolegroup).context(FragmentValidationFailureSnafu) - } } pub fn merged_env(rolegroup_config: Option<&BTreeMap>) -> Vec { let merged_env: Vec = if let Some(rolegroup_config) = rolegroup_config { - let env_vars_from_config: BTreeMap = rolegroup_config + rolegroup_config .iter() - .map(|(env_name, env_value)| { - ( - env_name.clone(), - EnvVar { - name: env_name.clone(), - value: Some(env_value.to_owned()), - value_from: None, - }, - ) + .map(|(env_name, env_value)| EnvVar { + name: env_name.clone(), + value: Some(env_value.to_owned()), + value_from: None, }) - .collect(); - env_vars_from_config.into_values().collect() + .collect() } else { vec![] }; merged_env } +pub enum AnyServiceConfig { + Master(HbaseConfig), + RegionServer(RegionServerConfig), + RestServer(HbaseConfig), +} + +impl AnyServiceConfig { + pub fn resources(&self) -> &Resources { + match self { + AnyServiceConfig::Master(config) => &config.resources, + AnyServiceConfig::RegionServer(config) => &config.resources, + AnyServiceConfig::RestServer(config) => &config.resources, + } + } + pub fn logging(&self) -> &Logging { + match self { + AnyServiceConfig::Master(config) => &config.logging, + AnyServiceConfig::RegionServer(config) => &config.logging, + AnyServiceConfig::RestServer(config) => &config.logging, + } + } + pub fn affinity(&self) -> &StackableAffinity { + match self { + AnyServiceConfig::Master(config) => &config.affinity, + AnyServiceConfig::RegionServer(config) => &config.affinity, + AnyServiceConfig::RestServer(config) => &config.affinity, + } + } + pub fn graceful_shutdown_timeout(&self) -> &Option { + match self { + AnyServiceConfig::Master(config) => &config.graceful_shutdown_timeout, + AnyServiceConfig::RegionServer(config) => &config.graceful_shutdown_timeout, + AnyServiceConfig::RestServer(config) => &config.graceful_shutdown_timeout, + } + } + pub fn requested_secret_lifetime(&self) -> Option { + match self { + AnyServiceConfig::Master(config) => config.requested_secret_lifetime, + AnyServiceConfig::RegionServer(config) => config.requested_secret_lifetime, + AnyServiceConfig::RestServer(config) => config.requested_secret_lifetime, + } + } + + /// Returns command line arguments to pass on to the region mover tool. + /// The following arguments are excluded because they are already part of the + /// hbase-entrypoint.sh script. 
+ /// The most important argument, '--regionserverhost' can only be computed on the Pod + /// because it contains the pod's hostname. + /// + /// Returns an empty string if the region mover is disabled or any other role is "self". + pub fn region_mover_args(&self) -> String { + match self { + AnyServiceConfig::RegionServer(config) => { + if config.region_mover.run_before_shutdown { + let timeout = config + .graceful_shutdown_timeout + .map(|d| { + if d.as_secs() <= DEFAULT_REGION_MOVER_DELTA_TO_SHUTDOWN.as_secs() { + d.as_secs() + } else { + d.as_secs() - DEFAULT_REGION_MOVER_DELTA_TO_SHUTDOWN.as_secs() + } + }) + .unwrap_or(DEFAULT_REGION_MOVER_TIMEOUT.as_secs()); + let mut command = vec![ + "--maxthreads".to_string(), + config.region_mover.max_threads.to_string(), + "--timeout".to_string(), + timeout.to_string(), + ]; + if !config.region_mover.ack { + command.push("--noack".to_string()); + } + + command.extend( + config + .region_mover + .cli_opts + .iter() + .flat_map(|o| o.additional_mover_options.clone()) + .map(|s| escape(std::borrow::Cow::Borrowed(&s)).to_string()), + ); + command.join(" ") + } else { + "".to_string() + } + } + _ => "".to_string(), + } + } + + pub fn run_region_mover(&self) -> bool { + match self { + AnyServiceConfig::RegionServer(config) => config.region_mover.run_before_shutdown, + _ => false, + } + } +} + #[cfg(test)] mod tests { use std::collections::{BTreeMap, HashMap}; use indoc::indoc; use product_config::{types::PropertyNameKind, ProductConfigManager}; + use rstest::rstest; use stackable_operator::product_config_utils::{ transform_all_roles_to_config, validate_all_roles_and_groups_config, }; - use crate::{merged_env, HbaseCluster, HbaseRole}; + use crate::{merged_env, AnyServiceConfig, HbaseCluster, HbaseRole, RegionMoverExtraCliOpts}; #[test] pub fn test_env_overrides() { @@ -741,6 +1286,8 @@ spec: config: logging: enableVectorAgent: False + regionMover: + runBeforeShutdown: false roleGroups: default: replicas: 1 @@ -761,7 +1308,7 @@ spec: HbaseRole::Master.to_string(), ( vec![PropertyNameKind::Env], - hbase.get_role(&HbaseRole::Master).cloned().unwrap(), + hbase.spec.masters.clone().unwrap(), ), )]); @@ -788,8 +1335,6 @@ spec: .map(|env_var| (env_var.name.as_str(), env_var.value.clone())) .collect(); - println!("{:#?}", merged_env); - assert_eq!( Some(&Some("MASTER_RG".to_string())), env_map.get("TEST_VAR") @@ -803,4 +1348,77 @@ spec: env_map.get("TEST_VAR_FROM_MRG") ); } + + #[rstest] + #[case("default", false, 1, vec![])] + #[case("groupRegionMover", true, 5, vec!["--some".to_string(), "extra".to_string()])] + pub fn test_region_mover_merge( + #[case] role_group_name: &str, + #[case] run_before_shutdown: bool, + #[case] max_threads: u16, + #[case] additional_mover_options: Vec, + ) { + let input = indoc! 
{r#" +--- +apiVersion: hbase.stackable.tech/v1alpha1 +kind: HbaseCluster +metadata: + name: test-hbase +spec: + image: + productVersion: 2.4.18 + clusterConfig: + hdfsConfigMapName: test-hdfs + zookeeperConfigMapName: test-znode + masters: + roleGroups: + default: + replicas: 1 + restServers: + roleGroups: + default: + replicas: 1 + regionServers: + config: + regionMover: + runBeforeShutdown: False + roleGroups: + default: + replicas: 1 + groupRegionMover: + replicas: 1 + config: + regionMover: + runBeforeShutdown: True + maxThreads: 5 + additionalMoverOptions: ["--some", "extra"] + "#}; + + let deserializer = serde_yaml::Deserializer::from_str(input); + let hbase: HbaseCluster = + serde_yaml::with::singleton_map_recursive::deserialize(deserializer).unwrap(); + + let hbase_role = HbaseRole::RegionServer; + let rolegroup = hbase.server_rolegroup_ref(hbase_role.to_string(), role_group_name); + + let merged_config = hbase + .merged_config( + &hbase_role, + &rolegroup.role_group, + &hbase.spec.cluster_config.hdfs_config_map_name, + ) + .unwrap(); + if let AnyServiceConfig::RegionServer(config) = merged_config { + assert_eq!(run_before_shutdown, config.region_mover.run_before_shutdown); + assert_eq!(max_threads, config.region_mover.max_threads); + assert_eq!( + Some(RegionMoverExtraCliOpts { + additional_mover_options + }), + config.region_mover.cli_opts + ); + } else { + panic!("this shouldn't happen"); + }; + } } diff --git a/rust/operator-binary/Cargo.toml b/rust/operator-binary/Cargo.toml index b98a172f..9de4298e 100644 --- a/rust/operator-binary/Cargo.toml +++ b/rust/operator-binary/Cargo.toml @@ -16,7 +16,6 @@ clap.workspace = true const_format.workspace = true fnv.workspace = true futures.workspace = true -indoc.workspace = true product-config.workspace = true serde.workspace = true snafu.workspace = true diff --git a/rust/operator-binary/src/config/jvm.rs b/rust/operator-binary/src/config/jvm.rs index 8a40368e..015b2dd9 100644 --- a/rust/operator-binary/src/config/jvm.rs +++ b/rust/operator-binary/src/config/jvm.rs @@ -1,11 +1,11 @@ use snafu::{OptionExt, ResultExt, Snafu}; use stackable_hbase_crd::{ - HbaseConfig, HbaseConfigFragment, HbaseRole, CONFIG_DIR_NAME, JVM_SECURITY_PROPERTIES_FILE, + AnyServiceConfig, HbaseCluster, HbaseRole, CONFIG_DIR_NAME, JVM_SECURITY_PROPERTIES_FILE, METRICS_PORT, }; use stackable_operator::{ memory::{BinaryMultiple, MemoryQuantity}, - role_utils::{self, GenericRoleConfig, JavaCommonConfig, JvmArgumentOverrides, Role}, + role_utils::{self, JvmArgumentOverrides}, }; const JAVA_HEAP_FACTOR: f32 = 0.8; @@ -22,6 +22,9 @@ pub enum Error { #[snafu(display("failed to merge jvm argument overrides"))] MergeJvmArgumentOverrides { source: role_utils::Error }, + + #[snafu(display("the HBase role [{role}] is missing from spec"))] + MissingHbaseRole { role: String }, } // Applies to both the servers and the CLI @@ -43,14 +46,13 @@ pub fn construct_global_jvm_args(kerberos_enabled: bool) -> String { jvm_args.join(" ") } -/// JVM arguments that specifically for the role (server), so will *not* be used e.g. by CLI tools -fn construct_role_specific_jvm_args( +/// Arguments that go into `HBASE_OPTS`, so *not* the heap settings (which go into `HBASE_HEAPSIZE`). 
+pub fn construct_role_specific_non_heap_jvm_args( + hbase: &HbaseCluster, hbase_role: &HbaseRole, - role: &Role, role_group: &str, product_version: &str, - kerberos_enabled: bool, -) -> Result, Error> { +) -> Result { let mut jvm_args = vec![format!( "-Djava.security.properties={CONFIG_DIR_NAME}/{JVM_SECURITY_PROPERTIES_FILE}" )]; @@ -61,35 +63,46 @@ fn construct_role_specific_jvm_args( format!("-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar={METRICS_PORT}:/stackable/jmx/{hbase_role}.yaml") ); } - if kerberos_enabled { + if hbase.has_kerberos_enabled() { jvm_args.push("-Djava.security.krb5.conf=/stackable/kerberos/krb5.conf".to_owned()); } let operator_generated = JvmArgumentOverrides::new_with_only_additions(jvm_args); - let merged = role - .get_merged_jvm_argument_overrides(role_group, &operator_generated) - .context(MergeJvmArgumentOverridesSnafu)?; - Ok(merged + + let merged = match hbase_role { + HbaseRole::Master => hbase + .spec + .masters + .as_ref() + .context(MissingHbaseRoleSnafu { + role: hbase_role.to_string(), + })? + .get_merged_jvm_argument_overrides(role_group, &operator_generated) + .context(MergeJvmArgumentOverridesSnafu)?, + HbaseRole::RegionServer => hbase + .spec + .region_servers + .as_ref() + .context(MissingHbaseRoleSnafu { + role: hbase_role.to_string(), + })? + .get_merged_jvm_argument_overrides(role_group, &operator_generated) + .context(MergeJvmArgumentOverridesSnafu)?, + HbaseRole::RestServer => hbase + .spec + .rest_servers + .as_ref() + .context(MissingHbaseRoleSnafu { + role: hbase_role.to_string(), + })? + .get_merged_jvm_argument_overrides(role_group, &operator_generated) + .context(MergeJvmArgumentOverridesSnafu)?, + }; + jvm_args = merged .effective_jvm_config_after_merging() // Sorry for the clone, that's how operator-rs is currently modelled :P - .clone()) -} + .clone(); -/// Arguments that go into `HBASE_OPTS`, so *not* the heap settings (which go into `HBASE_HEAPSIZE`). -pub fn construct_role_specific_non_heap_jvm_args( - hbase_role: &HbaseRole, - role: &Role, - role_group: &str, - product_version: &str, - kerberos_enabled: bool, -) -> Result { - let mut jvm_args = construct_role_specific_jvm_args( - hbase_role, - role, - role_group, - product_version, - kerberos_enabled, - )?; jvm_args.retain(|arg| !is_heap_jvm_argument(arg)); Ok(jvm_args.join(" ")) @@ -102,10 +115,10 @@ pub fn construct_role_specific_non_heap_jvm_args( /// Looking at `bin/hbase`, you can actually add the `m` suffix to make the unit more clear, the /// script will detect this [here](https://github.com/apache/hbase/blob/777010361abb203b8b17673d84acf4f7f1d0283a/bin/hbase#L165) /// and work correctly. 
-pub fn construct_hbase_heapsize_env(merged_config: &HbaseConfig) -> Result { +pub fn construct_hbase_heapsize_env(merged_config: &AnyServiceConfig) -> Result { let heap_size = MemoryQuantity::try_from( merged_config - .resources + .resources() .memory .limit .as_ref() @@ -154,17 +167,15 @@ mod tests { default: replicas: 1 "#; - let (hbase_role, merged_config, role, role_group, product_version) = + let (hbase, hbase_role, merged_config, role_group, product_version) = construct_boilerplate(input); - let kerberos_enabled = false; - let global_jvm_args = construct_global_jvm_args(kerberos_enabled); + let global_jvm_args = construct_global_jvm_args(false); let role_specific_non_heap_jvm_args = construct_role_specific_non_heap_jvm_args( + &hbase, &hbase_role, - &role, &role_group, &product_version, - kerberos_enabled, ) .unwrap(); let hbase_heapsize_env = construct_hbase_heapsize_env(&merged_config).unwrap(); @@ -190,6 +201,10 @@ mod tests { clusterConfig: hdfsConfigMapName: simple-hdfs zookeeperConfigMapName: simple-znode + authentication: + tlsSecretClass: tls + kerberos: + secretClass: kerberos-simple masters: roleGroups: default: @@ -214,17 +229,15 @@ mod tests { - -Xmx40000m # This has no effect! - -Dhttps.proxyPort=1234 "#; - let (hbase_role, merged_config, role, role_group, product_version) = + let (hbase, hbase_role, merged_config, role_group, product_version) = construct_boilerplate(input); - let kerberos_enabled = true; - let global_jvm_args = construct_global_jvm_args(kerberos_enabled); + let global_jvm_args = construct_global_jvm_args(hbase.has_kerberos_enabled()); let role_specific_non_heap_jvm_args = construct_role_specific_non_heap_jvm_args( + &hbase, &hbase_role, - &role, &role_group, &product_version, - kerberos_enabled, ) .unwrap(); let hbase_heapsize_env = construct_hbase_heapsize_env(&merged_config).unwrap(); @@ -247,27 +260,19 @@ mod tests { fn construct_boilerplate( hbase_cluster: &str, - ) -> ( - HbaseRole, - HbaseConfig, - Role, - String, - String, - ) { + ) -> (HbaseCluster, HbaseRole, AnyServiceConfig, String, String) { let hbase: HbaseCluster = serde_yaml::from_str(hbase_cluster).expect("illegal test input"); let hbase_role = HbaseRole::RegionServer; let merged_config = hbase .merged_config(&hbase_role, "default", "my-hdfs") .unwrap(); - let role: Role = - hbase.spec.region_servers.unwrap(); let product_version = hbase.spec.image.product_version().to_owned(); ( + hbase, hbase_role, merged_config, - role, "default".to_owned(), product_version, ) diff --git a/rust/operator-binary/src/hbase_controller.rs b/rust/operator-binary/src/hbase_controller.rs index f5d46041..2dccf9ae 100644 --- a/rust/operator-binary/src/hbase_controller.rs +++ b/rust/operator-binary/src/hbase_controller.rs @@ -7,7 +7,6 @@ use std::{ }; use const_format::concatcp; -use indoc::formatdoc; use product_config::{ types::PropertyNameKind, writer::{to_hadoop_xml, to_java_properties_string, PropertiesWriterError}, @@ -15,10 +14,9 @@ use product_config::{ }; use snafu::{OptionExt, ResultExt, Snafu}; use stackable_hbase_crd::{ - merged_env, Container, HbaseCluster, HbaseClusterStatus, HbaseConfig, HbaseConfigFragment, - HbaseRole, APP_NAME, CONFIG_DIR_NAME, HBASE_ENV_SH, HBASE_REST_PORT_NAME_HTTP, - HBASE_REST_PORT_NAME_HTTPS, HBASE_SITE_XML, JVM_SECURITY_PROPERTIES_FILE, SSL_CLIENT_XML, - SSL_SERVER_XML, + merged_env, AnyServiceConfig, Container, HbaseCluster, HbaseClusterStatus, HbaseRole, APP_NAME, + HBASE_ENV_SH, HBASE_REST_PORT_NAME_HTTP, HBASE_REST_PORT_NAME_HTTPS, HBASE_SITE_XML, + 
JVM_SECURITY_PROPERTIES_FILE, SSL_CLIENT_XML, SSL_SERVER_XML, }; use stackable_operator::{ builder::{ @@ -36,12 +34,11 @@ use stackable_operator::{ api::{ apps::v1::{StatefulSet, StatefulSetSpec}, core::v1::{ - ConfigMap, ConfigMapVolumeSource, ContainerPort, Probe, Service, ServiceAccount, - ServicePort, ServiceSpec, TCPSocketAction, Volume, + ConfigMap, ConfigMapVolumeSource, ContainerPort, EnvVar, Probe, Service, + ServiceAccount, ServicePort, ServiceSpec, TCPSocketAction, Volume, }, }, apimachinery::pkg::{apis::meta::v1::LabelSelector, util::intstr::IntOrString}, - DeepMerge, }, kube::{ core::{error_boundary, DeserializeGuard}, @@ -54,21 +51,19 @@ use stackable_operator::{ product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config}, product_logging::{ self, - framework::{ - create_vector_shutdown_file_command, remove_vector_shutdown_file_command, LoggingError, - }, + framework::LoggingError, spec::{ ConfigMapLogConfig, ContainerLogConfig, ContainerLogConfigChoice, CustomContainerLogConfig, }, }, - role_utils::{GenericRoleConfig, JavaCommonConfig, Role, RoleGroupRef}, + role_utils::{GenericRoleConfig, RoleGroupRef}, status::condition::{ compute_conditions, operations::ClusterOperationsConditionBuilder, statefulset::StatefulSetConditionBuilder, }, time::Duration, - utils::{cluster_info::KubernetesClusterInfo, COMMON_BASH_TRAP_FUNCTIONS}, + utils::cluster_info::KubernetesClusterInfo, }; use strum::{EnumDiscriminants, IntoStaticStr, ParseError}; @@ -79,14 +74,13 @@ use crate::{ }, discovery::build_discovery_configmap, kerberos::{ - self, add_kerberos_pod_config, kerberos_config_properties, - kerberos_container_start_commands, kerberos_ssl_client_settings, + self, add_kerberos_pod_config, kerberos_config_properties, kerberos_ssl_client_settings, kerberos_ssl_server_settings, }, operations::{graceful_shutdown::add_graceful_shutdown_config, pdb::add_pdbs}, product_logging::{ - extend_role_group_config_map, log4j_properties_file_name, - resolve_vector_aggregator_address, STACKABLE_LOG_DIR, + extend_role_group_config_map, resolve_vector_aggregator_address, + CONTAINERDEBUG_LOG_DIRECTORY, STACKABLE_LOG_DIR, }, security, security::opa::HbaseOpaConfig, @@ -101,6 +95,8 @@ pub const MAX_HBASE_LOG_FILES_SIZE: MemoryQuantity = MemoryQuantity { unit: BinaryMultiple::Mebi, }; +// These constants are hard coded in hbase-entrypoint.sh +// You need to change them there too. 
const HDFS_DISCOVERY_TMP_DIR: &str = "/stackable/tmp/hdfs"; const HBASE_CONFIG_TMP_DIR: &str = "/stackable/tmp/hbase"; const HBASE_LOG_CONFIG_TMP_DIR: &str = "/stackable/tmp/log_config"; @@ -117,6 +113,9 @@ pub struct Ctx { #[strum_discriminants(derive(IntoStaticStr))] #[allow(clippy::enum_variant_names)] pub enum Error { + #[snafu(display("invalid role properties"))] + RoleProperties { source: stackable_hbase_crd::Error }, + #[snafu(display("missing secret lifetime"))] MissingSecretLifetime, @@ -356,7 +355,7 @@ pub async fn reconcile_hbase( .await .context(ResolveVectorAggregatorAddressSnafu)?; - let roles = build_roles(hbase)?; + let roles = hbase.build_role_properties().context(RolePropertiesSnafu)?; let validated_config = validate_all_roles_and_groups_config( &resolved_product_image.app_version_label, @@ -429,11 +428,6 @@ pub async fn reconcile_hbase( role: role_name.to_string(), })?; for (rolegroup_name, rolegroup_config) in group_config.iter() { - let role = hbase - .get_role(&hbase_role) - .with_context(|| MissingHbaseRoleSnafu { - role: hbase_role.to_string(), - })?; let rolegroup = hbase.server_rolegroup_ref(role_name, rolegroup_name); let merged_config = hbase @@ -449,7 +443,6 @@ pub async fn reconcile_hbase( let rg_configmap = build_rolegroup_config_map( hbase, &client.kubernetes_cluster_info, - role, &rolegroup, rolegroup_config, &zookeeper_connection_information, @@ -460,6 +453,7 @@ pub async fn reconcile_hbase( )?; let rg_statefulset = build_rolegroup_statefulset( hbase, + &client.kubernetes_cluster_info, &hbase_role, &rolegroup, rolegroup_config, @@ -577,11 +571,10 @@ pub fn build_region_server_role_service( fn build_rolegroup_config_map( hbase: &HbaseCluster, cluster_info: &KubernetesClusterInfo, - role: &Role, rolegroup: &RoleGroupRef, rolegroup_config: &HashMap>, zookeeper_connection_information: &ZookeeperConnectionInformation, - merged_config: &HbaseConfig, + merged_config: &AnyServiceConfig, resolved_product_image: &ResolvedProductImage, hbase_opa_config: Option<&HbaseOpaConfig>, vector_aggregator_address: Option<&str>, @@ -623,7 +616,6 @@ fn build_rolegroup_config_map( hbase, merged_config, &hbase_role, - role, &rolegroup.role_group, &resolved_product_image.product_version, )?; @@ -713,7 +705,7 @@ fn build_rolegroup_config_map( extend_role_group_config_map( rolegroup, vector_aggregator_address, - &merged_config.logging, + merged_config.logging(), &mut builder, &resolved_product_image.product_version, ) @@ -789,21 +781,19 @@ fn build_rolegroup_service( /// The rolegroup [`StatefulSet`] runs the rolegroup, as configured by the administrator. /// /// The [`Pod`](`stackable_operator::k8s_openapi::api::core::v1::Pod`)s are accessible through the corresponding [`Service`] (from [`build_rolegroup_service`]). 
+#[allow(clippy::too_many_arguments)] fn build_rolegroup_statefulset( hbase: &HbaseCluster, + cluster_info: &KubernetesClusterInfo, hbase_role: &HbaseRole, rolegroup_ref: &RoleGroupRef, rolegroup_config: &HashMap>, - merged_config: &HbaseConfig, + merged_config: &AnyServiceConfig, resolved_product_image: &ResolvedProductImage, service_account: &ServiceAccount, ) -> Result { let hbase_version = &resolved_product_image.app_version_label; - // In hbase-op the restserver role is optional :/ - let role = hbase.get_role(hbase_role); - let role_group = role.and_then(|r| r.role_groups.get(&rolegroup_ref.role_group)); - let ports = hbase .ports(hbase_role, &resolved_product_image.product_version) .into_iter() @@ -869,49 +859,40 @@ fn build_rolegroup_statefulset( ..probe_template }; - let merged_env = merged_env(rolegroup_config.get(&PropertyNameKind::Env)); + let mut merged_env = merged_env(rolegroup_config.get(&PropertyNameKind::Env)); + // This env var is set for all roles to avoid bash's "unbound variable" errors + merged_env.extend([ + EnvVar { + name: "REGION_MOVER_OPTS".to_string(), + value: Some(merged_config.region_mover_args()), + ..EnvVar::default() + }, + EnvVar { + name: "RUN_REGION_MOVER".to_string(), + value: Some(merged_config.run_region_mover().to_string()), + ..EnvVar::default() + }, + EnvVar { + name: "STACKABLE_LOG_DIR".to_string(), + value: Some(STACKABLE_LOG_DIR.to_string()), + ..EnvVar::default() + }, + ]); - let log4j_properties_file_name = - log4j_properties_file_name(&resolved_product_image.product_version); let mut hbase_container = ContainerBuilder::new("hbase").expect("ContainerBuilder not created"); hbase_container .image_from_product_image(resolved_product_image) - .command(vec![ - "/bin/bash".to_string(), - "-x".to_string(), - "-euo".to_string(), - "pipefail".to_string(), - "-c".to_string(), + .command(vec!["/stackable/hbase/bin/hbase-entrypoint.sh".to_string()]) + .args(vec![ + hbase_role.cli_role_name(), + hbase_service_domain_name(hbase, rolegroup_ref, cluster_info)?, + hbase.service_port(hbase_role).to_string(), ]) - .args(vec![formatdoc! {" - mkdir -p {CONFIG_DIR_NAME} - cp {HDFS_DISCOVERY_TMP_DIR}/hdfs-site.xml {CONFIG_DIR_NAME} - cp {HDFS_DISCOVERY_TMP_DIR}/core-site.xml {CONFIG_DIR_NAME} - cp {HBASE_CONFIG_TMP_DIR}/* {CONFIG_DIR_NAME} - cp {HBASE_LOG_CONFIG_TMP_DIR}/{log4j_properties_file_name} {CONFIG_DIR_NAME} - - {kerberos_container_start_commands} - - {COMMON_BASH_TRAP_FUNCTIONS} - {remove_vector_shutdown_file_command} - prepare_signal_handlers - containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop & - bin/hbase {hbase_role_name_in_command} start & - wait_for_termination $! - {create_vector_shutdown_file_command} - ", - hbase_role_name_in_command = hbase_role.cli_role_name(), - kerberos_container_start_commands = kerberos_container_start_commands(hbase), - remove_vector_shutdown_file_command = - remove_vector_shutdown_file_command(STACKABLE_LOG_DIR), - create_vector_shutdown_file_command = - create_vector_shutdown_file_command(STACKABLE_LOG_DIR), - }]) .add_env_vars(merged_env) // Needed for the `containerdebug` process to log it's tracing information to. .add_env_var( "CONTAINERDEBUG_LOG_DIRECTORY", - format!("{STACKABLE_LOG_DIR}/containerdebug"), + &*CONTAINERDEBUG_LOG_DIRECTORY, ) .add_volume_mount("hbase-config", HBASE_CONFIG_TMP_DIR) .context(AddVolumeMountSnafu)? @@ -922,7 +903,7 @@ fn build_rolegroup_statefulset( .add_volume_mount("log", STACKABLE_LOG_DIR) .context(AddVolumeMountSnafu)? 
.add_container_ports(ports) - .resources(merged_config.resources.clone().into()) + .resources(merged_config.resources().clone().into()) .startup_probe(startup_probe) .liveness_probe(liveness_probe) .readiness_probe(readiness_probe); @@ -942,7 +923,7 @@ fn build_rolegroup_statefulset( pod_builder .metadata(pb_metadata) .image_pull_secrets_from_product_image(resolved_product_image) - .affinity(&merged_config.affinity) + .affinity(merged_config.affinity()) .add_volume(stackable_operator::k8s_openapi::api::core::v1::Volume { name: "hbase-config".to_string(), config_map: Some(ConfigMapVolumeSource { @@ -982,7 +963,7 @@ fn build_rolegroup_statefulset( Some(ContainerLogConfigChoice::Custom(CustomContainerLogConfig { custom: ConfigMapLogConfig { config_map }, })), - }) = merged_config.logging.containers.get(&Container::Hbase) + }) = merged_config.logging().containers.get(&Container::Hbase) { pod_builder .add_volume(Volume { @@ -1011,11 +992,10 @@ fn build_rolegroup_statefulset( if hbase.has_kerberos_enabled() { add_kerberos_pod_config( hbase, - hbase_role, &mut hbase_container, &mut pod_builder, merged_config - .requested_secret_lifetime + .requested_secret_lifetime() .context(MissingSecretLifetimeSnafu)?, ) .context(AddKerberosConfigSnafu)?; @@ -1023,13 +1003,13 @@ fn build_rolegroup_statefulset( pod_builder.add_container(hbase_container.build()); // Vector sidecar shall be the last container in the list - if merged_config.logging.enable_vector_agent { + if merged_config.logging().enable_vector_agent { pod_builder.add_container( product_logging::framework::vector_container( resolved_product_image, "hbase-config", "log", - merged_config.logging.containers.get(&Container::Vector), + merged_config.logging().containers.get(&Container::Vector), ResourceRequirementsBuilder::new() .with_cpu_request("250m") .with_cpu_limit("500m") @@ -1042,13 +1022,7 @@ fn build_rolegroup_statefulset( } let mut pod_template = pod_builder.build_template(); - - if let Some(role) = role { - pod_template.merge_from(role.config.pod_overrides.clone()); - } - if let Some(role_group) = role_group { - pod_template.merge_from(role_group.config.pod_overrides.clone()); - } + hbase.merge_pod_overrides(&mut pod_template, hbase_role, rolegroup_ref); let metadata = ObjectMetaBuilder::new() .name_and_namespace(hbase) @@ -1074,7 +1048,7 @@ fn build_rolegroup_statefulset( let statefulset_spec = StatefulSetSpec { pod_management_policy: Some("Parallel".to_string()), - replicas: role_group.and_then(|rg| rg.replicas).map(i32::from), + replicas: hbase.replicas(hbase_role, rolegroup_ref), selector: LabelSelector { match_labels: Some(statefulset_match_labels.into()), ..LabelSelector::default() @@ -1091,61 +1065,6 @@ fn build_rolegroup_statefulset( }) } -// The result type is only defined once, there is no value in extracting it into a type definition. 
-#[allow(clippy::type_complexity)] -fn build_roles( - hbase: &HbaseCluster, -) -> Result< - HashMap< - String, - ( - Vec, - Role, - ), - >, -> { - let config_types = vec![ - PropertyNameKind::Env, - PropertyNameKind::File(HBASE_ENV_SH.to_string()), - PropertyNameKind::File(HBASE_SITE_XML.to_string()), - PropertyNameKind::File(SSL_SERVER_XML.to_string()), - PropertyNameKind::File(SSL_CLIENT_XML.to_string()), - PropertyNameKind::File(JVM_SECURITY_PROPERTIES_FILE.to_string()), - ]; - - let mut roles = HashMap::from([ - ( - HbaseRole::Master.to_string(), - ( - config_types.to_owned(), - hbase - .get_role(&HbaseRole::Master) - .cloned() - .context(NoMasterRoleSnafu)?, - ), - ), - ( - HbaseRole::RegionServer.to_string(), - ( - config_types.to_owned(), - hbase - .get_role(&HbaseRole::RegionServer) - .cloned() - .context(NoRegionServerRoleSnafu)?, - ), - ), - ]); - - if let Some(rest_servers) = hbase.get_role(&HbaseRole::RestServer) { - roles.insert( - HbaseRole::RestServer.to_string(), - (config_types, rest_servers.to_owned()), - ); - } - - Ok(roles) -} - fn write_hbase_env_sh<'a, T>(properties: T) -> String where T: Iterator, @@ -1188,9 +1107,8 @@ pub fn build_recommended_labels<'a>( /// The content of the HBase `hbase-env.sh` file. fn build_hbase_env_sh( hbase: &HbaseCluster, - merged_config: &HbaseConfig, + merged_config: &AnyServiceConfig, hbase_role: &HbaseRole, - role: &Role, role_group: &str, product_version: &str, ) -> Result, Error> { @@ -1206,14 +1124,9 @@ fn build_hbase_env_sh( "HBASE_OPTS".to_owned(), construct_global_jvm_args(hbase.has_kerberos_enabled()), ); - let role_specific_non_heap_jvm_args = construct_role_specific_non_heap_jvm_args( - hbase_role, - role, - role_group, - product_version, - hbase.has_kerberos_enabled(), - ) - .context(ConstructJvmArgumentSnafu)?; + let role_specific_non_heap_jvm_args = + construct_role_specific_non_heap_jvm_args(hbase, hbase_role, role_group, product_version) + .context(ConstructJvmArgumentSnafu)?; match hbase_role { HbaseRole::Master => { result.insert( @@ -1256,6 +1169,28 @@ fn validate_cr(hbase: &HbaseCluster) -> Result<()> { Ok(()) } +/// Build the domain name of an HBase service pod. +/// The hbase-entrypoint.sh script uses this to build the fully qualified name of a pod +/// by appending it to the `HOSTNAME` environment variable. +/// This name is required by the RegionMover to function properly. 
+fn hbase_service_domain_name( + hbase: &HbaseCluster, + rolegroup_ref: &RoleGroupRef, + cluster_info: &KubernetesClusterInfo, +) -> Result { + let hbase_cluster_name = rolegroup_ref.object_name(); + let pod_namespace = hbase + .metadata + .namespace + .clone() + .context(ObjectHasNoNamespaceSnafu)?; + let cluster_domain = &cluster_info.cluster_domain; + + Ok(format!( + "{hbase_cluster_name}.{pod_namespace}.svc.{cluster_domain}" + )) +} + #[cfg(test)] mod test { use rstest::rstest; diff --git a/rust/operator-binary/src/kerberos.rs b/rust/operator-binary/src/kerberos.rs index e19aa33a..0ea1f0a1 100644 --- a/rust/operator-binary/src/kerberos.rs +++ b/rust/operator-binary/src/kerberos.rs @@ -1,10 +1,7 @@ use std::collections::BTreeMap; -use indoc::formatdoc; use snafu::{OptionExt, ResultExt, Snafu}; -use stackable_hbase_crd::{ - HbaseCluster, HbaseRole, TLS_STORE_DIR, TLS_STORE_PASSWORD, TLS_STORE_VOLUME_NAME, -}; +use stackable_hbase_crd::{HbaseCluster, TLS_STORE_DIR, TLS_STORE_PASSWORD, TLS_STORE_VOLUME_NAME}; use stackable_operator::{ builder::{ self, @@ -76,21 +73,21 @@ pub fn kerberos_config_properties( "hbase.master.kerberos.principal".to_string(), format!( "{service_name}/{principal_host_part}", - service_name = HbaseRole::Master.kerberos_service_name() + service_name = kerberos_service_name() ), ), ( "hbase.regionserver.kerberos.principal".to_string(), format!( "{service_name}/{principal_host_part}", - service_name = HbaseRole::RegionServer.kerberos_service_name() + service_name = kerberos_service_name() ), ), ( "hbase.rest.kerberos.principal".to_string(), format!( "{service_name}/{principal_host_part}", - service_name = HbaseRole::RestServer.kerberos_service_name() + service_name = kerberos_service_name() ), ), ( @@ -157,21 +154,21 @@ pub fn kerberos_discovery_config_properties( "hbase.master.kerberos.principal".to_string(), format!( "{service_name}/{principal_host_part}", - service_name = HbaseRole::Master.kerberos_service_name() + service_name = kerberos_service_name() ), ), ( "hbase.regionserver.kerberos.principal".to_string(), format!( "{service_name}/{principal_host_part}", - service_name = HbaseRole::RegionServer.kerberos_service_name() + service_name = kerberos_service_name() ), ), ( "hbase.rest.kerberos.principal".to_string(), format!( "{service_name}/{principal_host_part}", - service_name = HbaseRole::RestServer.kerberos_service_name() + service_name = kerberos_service_name() ), ), ])) @@ -230,7 +227,6 @@ pub fn kerberos_ssl_client_settings(hbase: &HbaseCluster) -> BTreeMap String { - if !hbase.has_kerberos_enabled() { - return String::new(); - } - - formatdoc! {" - export KERBEROS_REALM=$(grep -oP 'default_realm = \\K.*' /stackable/kerberos/krb5.conf)" - } -} - fn principal_host_part( hbase: &HbaseCluster, cluster_info: &KubernetesClusterInfo, @@ -303,3 +289,17 @@ fn principal_host_part( "{hbase_name}.{hbase_namespace}.svc.{cluster_domain}@${{env.KERBEROS_REALM}}" )) } + +/// We could have different service names depending on the role (e.g. "hbase-master", "hbase-regionserver" and +/// "hbase-restserver"). However, this produces error messages such as +/// [RpcServer.priority.RWQ.Fifo.write.handler=0,queue=0,port=16020] security.ShellBasedUnixGroupsMapping: unable to return groups for user hbase-master PartialGroupNameException The user name 'hbase-master' is not found.
id: 'hbase-master': no such user +/// or +/// Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.apache.hadoop.hbase.security.AccessDeniedException): org.apache.hadoop.hbase.security.AccessDeniedException: Insufficient permissions (user=hbase-master/hbase-master-default-1.hbase-master-default.kuttl-test-poetic-sunbeam.svc.cluster.local@CLUSTER.LOCAL, scope=hbase:meta, family=table:state, params=[table=hbase:meta,family=table:state],action=WRITE) +/// +/// Also the documentation states: +/// > A Kerberos principal has three parts, with the form username/fully.qualified.domain.name@YOUR-REALM.COM. We recommend using hbase as the username portion. +/// +/// As a result we use "hbase" everywhere (which e.g. differs from the current hdfs implementation) +fn kerberos_service_name() -> &'static str { + "hbase" +} diff --git a/rust/operator-binary/src/operations/graceful_shutdown.rs b/rust/operator-binary/src/operations/graceful_shutdown.rs index 509c3260..48e06463 100644 --- a/rust/operator-binary/src/operations/graceful_shutdown.rs +++ b/rust/operator-binary/src/operations/graceful_shutdown.rs @@ -1,5 +1,5 @@ use snafu::{ResultExt, Snafu}; -use stackable_hbase_crd::HbaseConfig; +use stackable_hbase_crd::AnyServiceConfig; use stackable_operator::builder::pod::PodBuilder; #[derive(Debug, Snafu)] @@ -11,14 +11,14 @@ pub enum Error { } pub fn add_graceful_shutdown_config( - merged_config: &HbaseConfig, + merged_config: &AnyServiceConfig, pod_builder: &mut PodBuilder, ) -> Result<(), Error> { // This must be always set by the merge mechanism, as we provide a default value, // users can not disable graceful shutdown. - if let Some(graceful_shutdown_timeout) = merged_config.graceful_shutdown_timeout { + if let Some(graceful_shutdown_timeout) = merged_config.graceful_shutdown_timeout() { pod_builder - .termination_grace_period(&graceful_shutdown_timeout) + .termination_grace_period(graceful_shutdown_timeout) .context(SetTerminationGracePeriodSnafu)?; } diff --git a/rust/operator-binary/src/product_logging.rs b/rust/operator-binary/src/product_logging.rs index 790923dc..d86f2bab 100644 --- a/rust/operator-binary/src/product_logging.rs +++ b/rust/operator-binary/src/product_logging.rs @@ -50,6 +50,8 @@ const HBASE_LOG4J2_FILE: &str = "hbase.log4j2.xml"; pub const LOG4J_CONFIG_FILE: &str = "log4j.properties"; pub const LOG4J2_CONFIG_FILE: &str = "log4j2.properties"; pub const STACKABLE_LOG_DIR: &str = "/stackable/log"; +pub static CONTAINERDEBUG_LOG_DIRECTORY: std::sync::LazyLock = + std::sync::LazyLock::new(|| format!("{STACKABLE_LOG_DIR}/containerdebug")); /// Return the address of the Vector aggregator if the corresponding ConfigMap name is given in the /// cluster spec diff --git a/tests/templates/kuttl/cluster-operation/01-install-zookeeper.yaml.j2 b/tests/templates/kuttl/cluster-operation/01-install-zookeeper.yaml.j2 index cdfc8dbf..0a331d50 100644 --- a/tests/templates/kuttl/cluster-operation/01-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/cluster-operation/01-install-zookeeper.yaml.j2 @@ -13,6 +13,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/cluster-operation/02-install-hdfs.yaml.j2 b/tests/templates/kuttl/cluster-operation/02-install-hdfs.yaml.j2 index 0bd3bb65..8a9a4bc0 100644 --- a/tests/templates/kuttl/cluster-operation/02-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/cluster-operation/02-install-hdfs.yaml.j2 @@ 
-14,6 +14,7 @@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -21,6 +22,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -28,6 +30,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/cluster-operation/03-install-hbase.yaml.j2 b/tests/templates/kuttl/cluster-operation/03-install-hbase.yaml.j2 index f68aee37..4732f740 100644 --- a/tests/templates/kuttl/cluster-operation/03-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/cluster-operation/03-install-hbase.yaml.j2 @@ -26,6 +26,7 @@ spec: {% endif %} masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -33,6 +34,7 @@ spec: replicas: 1 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -40,6 +42,7 @@ spec: replicas: 1 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/cluster-operation/10-pause-hbase.yaml.j2 b/tests/templates/kuttl/cluster-operation/10-pause-hbase.yaml.j2 index b0862dae..ffc8c7c0 100644 --- a/tests/templates/kuttl/cluster-operation/10-pause-hbase.yaml.j2 +++ b/tests/templates/kuttl/cluster-operation/10-pause-hbase.yaml.j2 @@ -29,6 +29,7 @@ spec: reconciliationPaused: true masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -36,6 +37,7 @@ spec: replicas: 1 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -43,6 +45,7 @@ spec: replicas: 1 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/cluster-operation/20-stop-hbase.yaml.j2 b/tests/templates/kuttl/cluster-operation/20-stop-hbase.yaml.j2 index 80acdb16..0f4c5665 100644 --- a/tests/templates/kuttl/cluster-operation/20-stop-hbase.yaml.j2 +++ b/tests/templates/kuttl/cluster-operation/20-stop-hbase.yaml.j2 @@ -29,6 +29,7 @@ spec: reconciliationPaused: false masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -36,6 +37,7 @@ spec: replicas: 1 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -43,6 +45,7 @@ spec: replicas: 1 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/cluster-operation/30-restart-hbase.yaml.j2 b/tests/templates/kuttl/cluster-operation/30-restart-hbase.yaml.j2 index c507e82e..388110b2 100644 --- a/tests/templates/kuttl/cluster-operation/30-restart-hbase.yaml.j2 +++ b/tests/templates/kuttl/cluster-operation/30-restart-hbase.yaml.j2 @@ -29,6 +29,7 @@ spec: reconciliationPaused: false masters: config: + gracefulShutdownTimeout: 1m logging: 
enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -36,6 +37,7 @@ spec: replicas: 1 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -43,6 +45,7 @@ spec: replicas: 1 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/kerberos/10-install-zk.yaml.j2 b/tests/templates/kuttl/kerberos/10-install-zk.yaml.j2 index 6bbee739..01330ba4 100644 --- a/tests/templates/kuttl/kerberos/10-install-zk.yaml.j2 +++ b/tests/templates/kuttl/kerberos/10-install-zk.yaml.j2 @@ -6,6 +6,7 @@ metadata: spec: image: productVersion: "{{ test_scenario['values']['zookeeper-latest'] }}" + pullPolicy: IfNotPresent clusterConfig: listenerClass: {{ test_scenario['values']['listener-class'] }} {% if lookup('env', 'VECTOR_AGGREGATOR') %} @@ -13,6 +14,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/kerberos/11-install-hdfs.yaml.j2 b/tests/templates/kuttl/kerberos/11-install-hdfs.yaml.j2 index 961a686e..305043bf 100644 --- a/tests/templates/kuttl/kerberos/11-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/kerberos/11-install-hdfs.yaml.j2 @@ -19,6 +19,7 @@ commands: spec: image: productVersion: "{{ test_scenario['values']['hdfs-latest'] }}" + pullPolicy: IfNotPresent clusterConfig: zookeeperConfigMapName: hdfs-znode dfsReplication: 1 @@ -31,6 +32,7 @@ commands: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m listenerClass: {{ test_scenario['values']['listener-class'] }} logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} @@ -43,6 +45,7 @@ commands: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m listenerClass: {{ test_scenario['values']['listener-class'] }} logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} @@ -52,6 +55,7 @@ commands: replicas: 2 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} configOverrides: *configOverrides diff --git a/tests/templates/kuttl/kerberos/20-access-hdfs.yaml.j2 b/tests/templates/kuttl/kerberos/20-access-hdfs.yaml.j2 index 4aeb7d7e..81a7bfd0 100644 --- a/tests/templates/kuttl/kerberos/20-access-hdfs.yaml.j2 +++ b/tests/templates/kuttl/kerberos/20-access-hdfs.yaml.j2 @@ -16,6 +16,7 @@ commands: containers: - name: access-hdfs image: oci.stackable.tech/sdp/hadoop:{{ test_scenario['values']['hdfs-latest'] }}-stackable0.0.0-dev + imagePullPolicy: IfNotPresent env: - name: HADOOP_CONF_DIR value: /stackable/conf/hdfs diff --git a/tests/templates/kuttl/kerberos/30-install-hbase.yaml.j2 b/tests/templates/kuttl/kerberos/30-install-hbase.yaml.j2 index 31a27a68..28766a48 100644 --- a/tests/templates/kuttl/kerberos/30-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/kerberos/30-install-hbase.yaml.j2 @@ -24,6 +24,7 @@ commands: {% else %} productVersion: "{{ test_scenario['values']['hbase'] }}" {% endif %} + pullPolicy: IfNotPresent clusterConfig: hdfsConfigMapName: hdfs zookeeperConfigMapName: hbase-znode @@ -37,6 +38,7 @@ commands: {% endif %} masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} resources: @@ -47,6 +49,7 @@ commands: replicas: 2 
regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -54,6 +57,7 @@ commands: replicas: 2 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/kerberos/41-access-hbase.yaml.j2 b/tests/templates/kuttl/kerberos/41-access-hbase.yaml.j2 index 8d10b83a..1cbe5744 100644 --- a/tests/templates/kuttl/kerberos/41-access-hbase.yaml.j2 +++ b/tests/templates/kuttl/kerberos/41-access-hbase.yaml.j2 @@ -20,6 +20,7 @@ commands: {% else %} image: oci.stackable.tech/sdp/hbase:{{ test_scenario['values']['hbase'] }}-stackable0.0.0-dev {% endif %} + imagePullPolicy: IfNotPresent env: - name: HBASE_CONF_DIR value: /stackable/conf/hbase diff --git a/tests/templates/kuttl/logging/02-install-zookeeper.yaml.j2 b/tests/templates/kuttl/logging/02-install-zookeeper.yaml.j2 index cdfc8dbf..0a331d50 100644 --- a/tests/templates/kuttl/logging/02-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/logging/02-install-zookeeper.yaml.j2 @@ -13,6 +13,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/logging/03-install-hdfs.yaml.j2 b/tests/templates/kuttl/logging/03-install-hdfs.yaml.j2 index 0bd3bb65..8a9a4bc0 100644 --- a/tests/templates/kuttl/logging/03-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/logging/03-install-hdfs.yaml.j2 @@ -14,6 +14,7 @@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -21,6 +22,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -28,6 +30,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/logging/05-install-hbase.yaml.j2 b/tests/templates/kuttl/logging/05-install-hbase.yaml.j2 index 2010483f..d12115f8 100644 --- a/tests/templates/kuttl/logging/05-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/logging/05-install-hbase.yaml.j2 @@ -39,6 +39,8 @@ spec: zookeeperConfigMapName: test-znode vectorAggregatorConfigMapName: hbase-vector-aggregator-discovery masters: + config: + gracefulShutdownTimeout: 1m roleGroups: automatic-log-config: replicas: 1 @@ -83,6 +85,8 @@ spec: custom: configMap: hbase-log-config regionServers: + config: + gracefulShutdownTimeout: 1m roleGroups: automatic-log-config: replicas: 1 @@ -116,6 +120,8 @@ spec: custom: configMap: hbase-log-config restServers: + config: + gracefulShutdownTimeout: 1m roleGroups: automatic-log-config: replicas: 1 diff --git a/tests/templates/kuttl/omid/10-install-zookeeper.yaml.j2 b/tests/templates/kuttl/omid/10-install-zookeeper.yaml.j2 index ddff0254..33da088d 100644 --- a/tests/templates/kuttl/omid/10-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/omid/10-install-zookeeper.yaml.j2 @@ -19,6 +19,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/omid/20-install-hdfs.yaml.j2 b/tests/templates/kuttl/omid/20-install-hdfs.yaml.j2 index 2e0c6c7b..80b12980 
100644 --- a/tests/templates/kuttl/omid/20-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/omid/20-install-hdfs.yaml.j2 @@ -19,6 +19,7 @@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m listenerClass: cluster-internal logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} @@ -27,6 +28,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m listenerClass: cluster-internal logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} @@ -35,6 +37,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/omid/30-assert.yaml b/tests/templates/kuttl/omid/30-assert.yaml index 9943096c..2f715985 100644 --- a/tests/templates/kuttl/omid/30-assert.yaml +++ b/tests/templates/kuttl/omid/30-assert.yaml @@ -12,7 +12,7 @@ metadata: spec: template: spec: - terminationGracePeriodSeconds: 1200 + terminationGracePeriodSeconds: 60 status: readyReplicas: 2 replicas: 2 @@ -24,7 +24,7 @@ metadata: spec: template: spec: - terminationGracePeriodSeconds: 3600 + terminationGracePeriodSeconds: 60 status: readyReplicas: 1 replicas: 1 @@ -36,7 +36,7 @@ metadata: spec: template: spec: - terminationGracePeriodSeconds: 300 + terminationGracePeriodSeconds: 60 status: readyReplicas: 1 replicas: 1 diff --git a/tests/templates/kuttl/omid/30-install-hbase.yaml.j2 b/tests/templates/kuttl/omid/30-install-hbase.yaml.j2 index 21b484aa..da39bab6 100644 --- a/tests/templates/kuttl/omid/30-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/omid/30-install-hbase.yaml.j2 @@ -21,6 +21,7 @@ spec: {% endif %} masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -28,6 +29,7 @@ spec: replicas: 2 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -35,6 +37,7 @@ spec: replicas: 1 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/opa/10-install-zk.yaml.j2 b/tests/templates/kuttl/opa/10-install-zk.yaml.j2 index 7bb5607e..2d229e93 100644 --- a/tests/templates/kuttl/opa/10-install-zk.yaml.j2 +++ b/tests/templates/kuttl/opa/10-install-zk.yaml.j2 @@ -13,6 +13,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/opa/11-install-opa.yaml.j2 b/tests/templates/kuttl/opa/11-install-opa.yaml.j2 index b15bc23a..61392e99 100644 --- a/tests/templates/kuttl/opa/11-install-opa.yaml.j2 +++ b/tests/templates/kuttl/opa/11-install-opa.yaml.j2 @@ -8,5 +8,7 @@ spec: productVersion: "{{ test_scenario['values']['opa'] }}" pullPolicy: IfNotPresent servers: + config: + gracefulShutdownTimeout: 1m roleGroups: default: {} diff --git a/tests/templates/kuttl/opa/16-install-hdfs.yaml.j2 b/tests/templates/kuttl/opa/16-install-hdfs.yaml.j2 index 2332a1ba..f3e11338 100644 --- a/tests/templates/kuttl/opa/16-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/opa/16-install-hdfs.yaml.j2 @@ -35,6 +35,7 @@ commands: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m listenerClass: 'cluster-internal' logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} @@ -58,6 +59,7 
@@ commands: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m listenerClass: 'cluster-internal' logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} @@ -67,6 +69,7 @@ commands: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} configOverrides: *configOverrides diff --git a/tests/templates/kuttl/opa/30-install-hbase.yaml.j2 b/tests/templates/kuttl/opa/30-install-hbase.yaml.j2 index e71f6109..92fda28c 100644 --- a/tests/templates/kuttl/opa/30-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/opa/30-install-hbase.yaml.j2 @@ -65,6 +65,7 @@ commands: {% endif %} masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -79,6 +80,7 @@ commands: configMap: hbase-log-config regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -93,6 +95,7 @@ commands: configMap: hbase-log-config restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/orphaned_resources/01-install-zookeeper.yaml.j2 b/tests/templates/kuttl/orphaned_resources/01-install-zookeeper.yaml.j2 index cdfc8dbf..0a331d50 100644 --- a/tests/templates/kuttl/orphaned_resources/01-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/orphaned_resources/01-install-zookeeper.yaml.j2 @@ -13,6 +13,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/orphaned_resources/02-install-hdfs.yaml.j2 b/tests/templates/kuttl/orphaned_resources/02-install-hdfs.yaml.j2 index 0bd3bb65..8a9a4bc0 100644 --- a/tests/templates/kuttl/orphaned_resources/02-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/orphaned_resources/02-install-hdfs.yaml.j2 @@ -14,6 +14,7 @@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -21,6 +22,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -28,6 +30,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/orphaned_resources/03-install-hbase.yaml.j2 b/tests/templates/kuttl/orphaned_resources/03-install-hbase.yaml.j2 index 08d33e65..bd1c04d5 100644 --- a/tests/templates/kuttl/orphaned_resources/03-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/orphaned_resources/03-install-hbase.yaml.j2 @@ -20,6 +20,7 @@ spec: {% endif %} masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -27,6 +28,7 @@ spec: replicas: 1 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -34,6 +36,7 @@ spec: replicas: 1 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git 
a/tests/templates/kuttl/overrides/10-install-zookeeper.yaml.j2 b/tests/templates/kuttl/overrides/10-install-zookeeper.yaml.j2 index cdfc8dbf..0a331d50 100644 --- a/tests/templates/kuttl/overrides/10-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/overrides/10-install-zookeeper.yaml.j2 @@ -13,6 +13,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/overrides/20-install-hdfs.yaml.j2 b/tests/templates/kuttl/overrides/20-install-hdfs.yaml.j2 index 0bd3bb65..8a9a4bc0 100644 --- a/tests/templates/kuttl/overrides/20-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/overrides/20-install-hdfs.yaml.j2 @@ -14,6 +14,7 @@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -21,6 +22,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -28,6 +30,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/overrides/30-install-hbase.yaml.j2 b/tests/templates/kuttl/overrides/30-install-hbase.yaml.j2 index 61d99a2c..e3dd8ff9 100644 --- a/tests/templates/kuttl/overrides/30-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/overrides/30-install-hbase.yaml.j2 @@ -23,6 +23,7 @@ spec: TEST_VAR_FROM_MASTER: MASTER TEST_VAR: MASTER config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -36,6 +37,7 @@ spec: TEST_VAR_FROM_RS: REGIONSERVER TEST_VAR: REGIONSERVER config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -54,6 +56,7 @@ spec: TEST_VAR_FROM_REST: RESTSERVER TEST_VAR: RESTSERVER config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/profiling/01-install-zookeeper.yaml.j2 b/tests/templates/kuttl/profiling/01-install-zookeeper.yaml.j2 index cdfc8dbf..0a331d50 100644 --- a/tests/templates/kuttl/profiling/01-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/profiling/01-install-zookeeper.yaml.j2 @@ -13,6 +13,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/profiling/02-install-hdfs.yaml.j2 b/tests/templates/kuttl/profiling/02-install-hdfs.yaml.j2 index 7d2b795b..f9194a60 100644 --- a/tests/templates/kuttl/profiling/02-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/profiling/02-install-hdfs.yaml.j2 @@ -15,6 +15,7 @@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -22,6 +23,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -29,6 +31,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git 
a/tests/templates/kuttl/profiling/03-install-hbase.yaml.j2 b/tests/templates/kuttl/profiling/03-install-hbase.yaml.j2 index 74038173..f8c40ed2 100644 --- a/tests/templates/kuttl/profiling/03-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/profiling/03-install-hbase.yaml.j2 @@ -20,6 +20,7 @@ spec: {% endif %} masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -27,6 +28,7 @@ spec: replicas: 1 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -34,6 +36,7 @@ spec: replicas: 1 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/resources/10-install-zookeeper.yaml.j2 b/tests/templates/kuttl/resources/10-install-zookeeper.yaml.j2 index cdfc8dbf..0a331d50 100644 --- a/tests/templates/kuttl/resources/10-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/resources/10-install-zookeeper.yaml.j2 @@ -13,6 +13,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/resources/20-install-hdfs.yaml.j2 b/tests/templates/kuttl/resources/20-install-hdfs.yaml.j2 index 0bd3bb65..8a9a4bc0 100644 --- a/tests/templates/kuttl/resources/20-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/resources/20-install-hdfs.yaml.j2 @@ -14,6 +14,7 @@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -21,6 +22,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -28,6 +30,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/resources/30-install-hbase.yaml.j2 b/tests/templates/kuttl/resources/30-install-hbase.yaml.j2 index a4aecf82..89fb0bc6 100644 --- a/tests/templates/kuttl/resources/30-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/resources/30-install-hbase.yaml.j2 @@ -20,6 +20,7 @@ spec: {% endif %} masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -27,6 +28,7 @@ spec: replicas: 1 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} resources: @@ -59,6 +61,7 @@ spec: cpu: 2100m restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/shutdown/00-limit-range.yaml b/tests/templates/kuttl/shutdown/00-limit-range.yaml new file mode 100644 index 00000000..7b6cb30e --- /dev/null +++ b/tests/templates/kuttl/shutdown/00-limit-range.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: v1 +kind: LimitRange +metadata: + name: limit-request-ratio +spec: + limits: + - type: "Container" + maxLimitRequestRatio: + cpu: 5 + memory: 1 diff --git a/tests/templates/kuttl/shutdown/00-patch-ns.yaml.j2 b/tests/templates/kuttl/shutdown/00-patch-ns.yaml.j2 new file mode 100644 index 00000000..67185acf --- /dev/null +++ 
b/tests/templates/kuttl/shutdown/00-patch-ns.yaml.j2 @@ -0,0 +1,9 @@ +{% if test_scenario['values']['openshift'] == 'true' %} +# see https://github.com/stackabletech/issues/issues/566 +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl patch namespace $NAMESPACE -p '{"metadata":{"labels":{"pod-security.kubernetes.io/enforce":"privileged"}}}' + timeout: 120 +{% endif %} diff --git a/tests/templates/kuttl/shutdown/01-assert.yaml.j2 b/tests/templates/kuttl/shutdown/01-assert.yaml.j2 new file mode 100644 index 00000000..50b1d4c3 --- /dev/null +++ b/tests/templates/kuttl/shutdown/01-assert.yaml.j2 @@ -0,0 +1,10 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +{% if lookup('env', 'VECTOR_AGGREGATOR') %} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-aggregator-discovery +{% endif %} diff --git a/tests/templates/kuttl/shutdown/01-install-vector-aggregator-discovery-configmap.yaml.j2 b/tests/templates/kuttl/shutdown/01-install-vector-aggregator-discovery-configmap.yaml.j2 new file mode 100644 index 00000000..2d6a0df5 --- /dev/null +++ b/tests/templates/kuttl/shutdown/01-install-vector-aggregator-discovery-configmap.yaml.j2 @@ -0,0 +1,9 @@ +{% if lookup('env', 'VECTOR_AGGREGATOR') %} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-aggregator-discovery +data: + ADDRESS: {{ lookup('env', 'VECTOR_AGGREGATOR') }} +{% endif %} diff --git a/tests/templates/kuttl/shutdown/10-assert.yaml b/tests/templates/kuttl/shutdown/10-assert.yaml new file mode 100644 index 00000000..a1e216b4 --- /dev/null +++ b/tests/templates/kuttl/shutdown/10-assert.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +metadata: + name: install-zk +timeout: 600 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-zk-server-default +status: + readyReplicas: 1 + replicas: 1 diff --git a/tests/templates/kuttl/shutdown/10-install-zookeeper.yaml.j2 b/tests/templates/kuttl/shutdown/10-install-zookeeper.yaml.j2 new file mode 100644 index 00000000..e5f6ec65 --- /dev/null +++ b/tests/templates/kuttl/shutdown/10-install-zookeeper.yaml.j2 @@ -0,0 +1,30 @@ +--- +apiVersion: zookeeper.stackable.tech/v1alpha1 +kind: ZookeeperCluster +metadata: + name: test-zk +spec: + image: + productVersion: "{{ test_scenario['values']['zookeeper-latest'] }}" + pullPolicy: IfNotPresent + clusterConfig: + listenerClass: "cluster-internal" +{% if lookup('env', 'VECTOR_AGGREGATOR') %} + vectorAggregatorConfigMapName: vector-aggregator-discovery +{% endif %} + servers: + config: + gracefulShutdownTimeout: 1m + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 1 +--- +apiVersion: zookeeper.stackable.tech/v1alpha1 +kind: ZookeeperZnode +metadata: + name: test-znode +spec: + clusterRef: + name: test-zk diff --git a/tests/templates/kuttl/shutdown/20-assert.yaml b/tests/templates/kuttl/shutdown/20-assert.yaml new file mode 100644 index 00000000..8800b24d --- /dev/null +++ b/tests/templates/kuttl/shutdown/20-assert.yaml @@ -0,0 +1,30 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +metadata: + name: install-hdfs +timeout: 600 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-hdfs-namenode-default +status: + readyReplicas: 2 + replicas: 2 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-hdfs-journalnode-default +status: + readyReplicas: 1 + replicas: 1 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-hdfs-datanode-default 
+status: + readyReplicas: 1 + replicas: 1 diff --git a/tests/templates/kuttl/shutdown/20-install-hdfs.yaml.j2 b/tests/templates/kuttl/shutdown/20-install-hdfs.yaml.j2 new file mode 100644 index 00000000..844c4b27 --- /dev/null +++ b/tests/templates/kuttl/shutdown/20-install-hdfs.yaml.j2 @@ -0,0 +1,40 @@ +--- +apiVersion: hdfs.stackable.tech/v1alpha1 +kind: HdfsCluster +metadata: + name: test-hdfs +spec: + image: + productVersion: "{{ test_scenario['values']['hdfs-latest'] }}" + pullPolicy: IfNotPresent + clusterConfig: + zookeeperConfigMapName: test-znode +{% if lookup('env', 'VECTOR_AGGREGATOR') %} + vectorAggregatorConfigMapName: vector-aggregator-discovery +{% endif %} + nameNodes: + config: + gracefulShutdownTimeout: 1m + listenerClass: "cluster-internal" + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 2 + dataNodes: + config: + gracefulShutdownTimeout: 1m + listenerClass: "cluster-internal" + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 1 + journalNodes: + config: + gracefulShutdownTimeout: 1m + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 1 diff --git a/tests/templates/kuttl/shutdown/30-assert.yaml b/tests/templates/kuttl/shutdown/30-assert.yaml new file mode 100644 index 00000000..4331bc64 --- /dev/null +++ b/tests/templates/kuttl/shutdown/30-assert.yaml @@ -0,0 +1,69 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +metadata: + name: install-hbase +timeout: 600 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-hbase-master-default +spec: + template: + spec: + terminationGracePeriodSeconds: 60 +status: + readyReplicas: 2 + replicas: 2 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-hbase-regionserver-default +spec: + template: + spec: + terminationGracePeriodSeconds: 120 +status: + readyReplicas: 2 + replicas: 2 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-hbase-restserver-default +spec: + template: + spec: + terminationGracePeriodSeconds: 60 +status: + readyReplicas: 2 + replicas: 2 +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: test-hbase-master +status: + expectedPods: 2 + currentHealthy: 2 + disruptionsAllowed: 1 +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: test-hbase-regionserver +status: + expectedPods: 2 + currentHealthy: 2 + disruptionsAllowed: 1 +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: test-hbase-restserver +status: + expectedPods: 2 + currentHealthy: 2 + disruptionsAllowed: 1 diff --git a/tests/templates/kuttl/shutdown/30-install-hbase.yaml.j2 b/tests/templates/kuttl/shutdown/30-install-hbase.yaml.j2 new file mode 100644 index 00000000..4ef58b80 --- /dev/null +++ b/tests/templates/kuttl/shutdown/30-install-hbase.yaml.j2 @@ -0,0 +1,61 @@ +--- +apiVersion: hbase.stackable.tech/v1alpha1 +kind: HbaseCluster +metadata: + name: test-hbase +spec: + image: +{% if test_scenario['values']['hbase'].find(",") > 0 %} + custom: "{{ test_scenario['values']['hbase'].split(',')[1] }}" + productVersion: "{{ test_scenario['values']['hbase'].split(',')[0] }}" +{% else %} + productVersion: "{{ test_scenario['values']['hbase'] }}" +{% endif %} + pullPolicy: IfNotPresent + clusterConfig: + hdfsConfigMapName: test-hdfs + zookeeperConfigMapName: test-znode + listenerClass: "cluster-internal" +{% if lookup('env', 
'VECTOR_AGGREGATOR') %} + vectorAggregatorConfigMapName: vector-aggregator-discovery +{% endif %} + masters: + config: + gracefulShutdownTimeout: 1m + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 2 + configOverrides: + hbase-site.xml: + # The HBase master will take 10 minutes to observe a region server as "crashed". + # This gives the region mover enough time to run and ensures that masters do not interfere. + zookeeper.session.timeout: "600000" + + # Prevent the master from re-assigning the region when the region server is + # gone. Otherwise, the test case would not fail if the region mover fails. The + # default retention wait period is larger than the test step timeout. This + # works only for HBase 2.6 (https://issues.apache.org/jira/browse/HBASE-27551). + hbase.master.scp.retain.assignment: "true" + hbase.master.scp.retain.assignment.force: "true" + regionServers: + config: + gracefulShutdownTimeout: 2m # one minute for the region mover + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + regionMover: + runBeforeShutdown: true + ack: true + maxThreads: 1 + roleGroups: + default: + replicas: 2 + restServers: + config: + gracefulShutdownTimeout: 1m + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 2 diff --git a/tests/templates/kuttl/shutdown/50-assert.yaml b/tests/templates/kuttl/shutdown/50-assert.yaml new file mode 100644 index 00000000..7ca054c0 --- /dev/null +++ b/tests/templates/kuttl/shutdown/50-assert.yaml @@ -0,0 +1,18 @@ +--- +# This test works as follows: +# - given +# - an HBase cluster with two region servers (0 and 1) +# - create a table + column family with 15 regions +# - where region server 0 has some regions assigned to it +# - restart server 0 (the region mover is triggered by the shutdown) +# - assert that server 1 now hosts all 15 regions apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +metadata: + name: test-hbase +commands: + - script: kubectl exec --namespace=$NAMESPACE test-hbase-master-default-0 -- /tmp/create_regions.sh + - script: kubectl delete --namespace=$NAMESPACE pod/test-hbase-regionserver-default-0 + - script: sleep 10 + - script: kubectl exec --namespace=$NAMESPACE test-hbase-master-default-0 -- /tmp/count_regions.sh +timeout: 240 diff --git a/tests/templates/kuttl/shutdown/50-test-hbase.yaml b/tests/templates/kuttl/shutdown/50-test-hbase.yaml new file mode 100644 index 00000000..6c5d290a --- /dev/null +++ b/tests/templates/kuttl/shutdown/50-test-hbase.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl cp --namespace=$NAMESPACE ./create_regions.sh test-hbase-master-default-0:/tmp + - script: kubectl cp --namespace=$NAMESPACE ./count_regions.sh test-hbase-master-default-0:/tmp diff --git a/tests/templates/kuttl/shutdown/count_regions.sh b/tests/templates/kuttl/shutdown/count_regions.sh new file mode 100755 index 00000000..9ca3bd47 --- /dev/null +++ b/tests/templates/kuttl/shutdown/count_regions.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# +# Count the number of regions on server 1. +# It should contain all 15 regions after region server 0 has been restarted.
+# +set -euo 'pipefail' +set -x + +REGION_COUNT_ON_1=$(echo "list_regions 't1'" | /stackable/hbase/bin/hbase shell --noninteractive | grep -c test-hbase-regionserver-default-1) + +test "${REGION_COUNT_ON_1}" -eq 15 diff --git a/tests/templates/kuttl/shutdown/create_regions.sh b/tests/templates/kuttl/shutdown/create_regions.sh new file mode 100755 index 00000000..122fce27 --- /dev/null +++ b/tests/templates/kuttl/shutdown/create_regions.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# +# Create a table with 15 regions and count the number of regions on server 0. +# It should be more than 0. +# +set -x + +# We need to check if t1 exists before creating the table. +# The table might already exist if, in a previous run, the final check +# for regions on server 0 failed. +# This can happen if HBase has not assigned any regions to server 0 yet, and +# so kuttl re-runs this test step. +T1_EXISTS=$(echo "list" | /stackable/hbase/bin/hbase shell --noninteractive | grep -c t1) +if [ "$T1_EXISTS" == "0" ]; then + /stackable/hbase/bin/hbase shell --noninteractive <<'EOF' +balance_switch false; +create 't1', 'f1', {NUMREGIONS => 15, SPLITALGO => 'HexStringSplit'}; +EOF +fi + +REGION_COUNT_ON_0=$(echo "list_regions 't1'" | /stackable/hbase/bin/hbase shell --noninteractive | grep -c test-hbase-regionserver-default-0) + +test "${REGION_COUNT_ON_0}" -gt 0 diff --git a/tests/templates/kuttl/smoke/10-install-zookeeper.yaml.j2 b/tests/templates/kuttl/smoke/10-install-zookeeper.yaml.j2 index dc9baea2..0d86426b 100644 --- a/tests/templates/kuttl/smoke/10-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/smoke/10-install-zookeeper.yaml.j2 @@ -14,6 +14,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/smoke/20-install-hdfs.yaml.j2 b/tests/templates/kuttl/smoke/20-install-hdfs.yaml.j2 index 48fafdc9..1d1b9af6 100644 --- a/tests/templates/kuttl/smoke/20-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/smoke/20-install-hdfs.yaml.j2 @@ -14,6 +14,7 @@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m listenerClass: {{ test_scenario['values']['listener-class'] }} logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} @@ -22,6 +23,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m listenerClass: {{ test_scenario['values']['listener-class'] }} logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} @@ -30,6 +32,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/snapshot-export/11-install-zookeeper.yaml.j2 b/tests/templates/kuttl/snapshot-export/11-install-zookeeper.yaml.j2 index caecd8b8..8917b85a 100644 --- a/tests/templates/kuttl/snapshot-export/11-install-zookeeper.yaml.j2 +++ b/tests/templates/kuttl/snapshot-export/11-install-zookeeper.yaml.j2 @@ -13,6 +13,7 @@ spec: {% endif %} servers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/snapshot-export/12-install-hdfs.yaml.j2 b/tests/templates/kuttl/snapshot-export/12-install-hdfs.yaml.j2 index 0bd3bb65..8a9a4bc0 100644 --- a/tests/templates/kuttl/snapshot-export/12-install-hdfs.yaml.j2 +++ b/tests/templates/kuttl/snapshot-export/12-install-hdfs.yaml.j2 @@ -14,6 +14,7
@@ spec: {% endif %} nameNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -21,6 +22,7 @@ spec: replicas: 2 dataNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -28,6 +30,7 @@ spec: replicas: 1 journalNodes: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/templates/kuttl/snapshot-export/20-install-hbase.yaml.j2 b/tests/templates/kuttl/snapshot-export/20-install-hbase.yaml.j2 index 66529561..bbbd0673 100644 --- a/tests/templates/kuttl/snapshot-export/20-install-hbase.yaml.j2 +++ b/tests/templates/kuttl/snapshot-export/20-install-hbase.yaml.j2 @@ -20,6 +20,7 @@ spec: {% endif %} masters: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -27,6 +28,7 @@ spec: replicas: 1 regionServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: @@ -34,6 +36,7 @@ spec: replicas: 1 restServers: config: + gracefulShutdownTimeout: 1m logging: enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} roleGroups: diff --git a/tests/test-definition.yaml b/tests/test-definition.yaml index a000c60f..8b8821d0 100644 --- a/tests/test-definition.yaml +++ b/tests/test-definition.yaml @@ -134,6 +134,12 @@ tests: - zookeeper-latest - omid - openshift + - name: shutdown + dimensions: + - hbase + - hdfs-latest + - zookeeper-latest + - openshift suites: - name: nightly patch:
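
For orientation, the following is a minimal, self-contained sketch of the two ideas the operator changes above revolve around: deriving the rolegroup service domain that hbase-entrypoint.sh appends to HOSTNAME, and assembling a REGION_MOVER_OPTS value from the new regionMover settings. It uses only the standard library; the helper names, flag spellings, and example values are assumptions made for illustration, not the operator's actual API or the verified CLI of HBase's RegionMover tool.

// Illustrative sketch only; names and flags below are assumptions, not the operator's real API.

/// Mirrors the idea behind `hbase_service_domain_name`:
/// `<rolegroup-object-name>.<namespace>.svc.<cluster-domain>`.
/// The entrypoint script would prepend `$HOSTNAME.` to this to obtain the pod FQDN.
fn service_domain_name(
    rolegroup_object_name: &str,
    namespace: &str,
    cluster_domain: &str,
) -> String {
    format!("{rolegroup_object_name}.{namespace}.svc.{cluster_domain}")
}

/// Hypothetical assembly of REGION_MOVER_OPTS from the `regionMover` CRD fields
/// (`maxThreads`, `ack`); the actual option names passed to the region mover may differ.
fn region_mover_opts(max_threads: u16, ack: bool) -> String {
    let ack_flag = if ack { "--ack" } else { "--noack" };
    format!("--maxthreads {max_threads} {ack_flag}")
}

fn main() {
    let domain = service_domain_name(
        "test-hbase-regionserver-default",
        "kuttl-test",
        "cluster.local",
    );
    // A pod named test-hbase-regionserver-default-0 would identify itself as
    // test-hbase-regionserver-default-0.<domain> when draining its regions to other servers.
    println!("service domain: {domain}");
    println!("REGION_MOVER_OPTS={}", region_mover_opts(1, true));
}

The kuttl shutdown test added above exercises this path end to end: it deletes test-hbase-regionserver-default-0 and then asserts via count_regions.sh that all 15 regions of table t1 ended up on server 1.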