diff --git a/CHANGELOG.md b/CHANGELOG.md
index bfc74d90..7a2f6e30 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,7 +23,7 @@ All notable changes to this project will be documented in this file.
   by `FILE_LOG_DIRECTORY` (or via `--file-log-directory `).
 - Replace stackable-operator `print_startup_string` with `tracing::info!` with fields.
 - BREAKING: Inject the vector aggregator address into the vector config using the env var `VECTOR_AGGREGATOR_ADDRESS` instead
-  of having the operator write it to the vector config ([#551]).
+  of having the operator write it to the vector config ([#551]).
 - Document that Spark Connect doesn't integrate with the history server ([#559])
 - test: Bump to Vector `0.46.1` ([#565]).
 - Use versioned common structs ([#572]).
@@ -33,7 +33,7 @@ All notable changes to this project will be documented in this file.
   - The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator
   - The defaults from the docker images itself will now apply, which will be different from 1000/0 going forward
   - This is marked as breaking because tools and policies might exist, which require these fields to be set
-- BREAKING: the JMX exporter has been an replaced with the built-in Prometheus servlet. The history server pods do not expose metrics anymore ([#584])
+- Enable the built-in Prometheus servlet. The JMX exporter was removed ([#584]) but added back ([#585]).
 
 ### Fixed
 
@@ -61,6 +61,7 @@ All notable changes to this project will be documented in this file.
 [#580]: https://github.com/stackabletech/spark-k8s-operator/pull/580
 [#575]: https://github.com/stackabletech/spark-k8s-operator/pull/575
 [#584]: https://github.com/stackabletech/spark-k8s-operator/pull/584
+[#585]: https://github.com/stackabletech/spark-k8s-operator/pull/585
 
 ## [25.3.0] - 2025-03-21
 
@@ -111,7 +112,7 @@ All notable changes to this project will be documented in this file.
 - BREAKING: The fields `connection` and `host` on `S3Connection` as well as `bucketName` on `S3Bucket`are now mandatory ([#472]).
 - Fix `envOverrides` for SparkApplication and SparkHistoryServer ([#451]).
 - Ensure SparkApplications can only create a single submit Job. Fix for #457 ([#460]).
-- Invalid `SparkApplication`/`SparkHistoryServer` objects don't cause the operator to stop functioning (#[482]).
+- Invalid `SparkApplication`/`SparkHistoryServer` objects don't cause the operator to stop functioning (#[482]).
 
 ### Removed
 
@@ -186,7 +187,7 @@ All notable changes to this project will be documented in this file.
 - Support PodDisruptionBudgets for HistoryServer ([#288]).
 - Support for versions 3.4.1, 3.5.0 ([#291]).
 - History server now exports metrics via jmx exporter (port 18081) ([#291]).
-- Document graceful shutdown ([#306]).
+- Document graceful shutdown ([#306]).
 
 ### Changed
 
diff --git a/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc b/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc
index 276aafc6..d099505d 100644
--- a/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc
+++ b/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc
@@ -161,6 +161,6 @@ image::history-server-ui.png[History Server Console]
 
 [NOTE]
 ====
-Up to version 25.3 of the Stackable Data Platform, the history server used the JMX exporter to expose metrics on a separate port.
-Starting with version 25.7 the JMX exporter has been removed and the history server doesn't expose metrics as of Spark version 3.5.6.
+Starting with version 25.7, the built-in Prometheus servlet is enabled in addition to the existing JMX exporter.
+The JMX exporter is still available but it is deprecated and will be removed in a future release.
 ====
diff --git a/docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc b/docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc
index 08367f0f..d955a3ca 100644
--- a/docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc
+++ b/docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc
@@ -13,8 +13,8 @@ To resubmit an application, a new SparkApplication resource must be created.
 
 [NOTE]
 ====
-Up to version 25.3 of the Stackable Data Platform, Spark applications used the JMX exporter to expose metrics on a separate port.
-Starting with version 25.7, the built-in Prometheus servlet is used instead.
+Starting with version 25.7, the built-in Prometheus servlet is enabled.
+The JMX exporter is still available but has never been used automatically for applications, and it is now deprecated.
 ====
 
 Application driver pods expose Prometheus metrics at the following endpoints:
diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs
index 25947792..7e32e392 100644
--- a/rust/operator-binary/src/crd/constants.rs
+++ b/rust/operator-binary/src/crd/constants.rs
@@ -86,6 +86,7 @@ pub const SPARK_DEFAULTS_FILE_NAME: &str = "spark-defaults.conf";
 pub const SPARK_ENV_SH_FILE_NAME: &str = "spark-env.sh";
 pub const SPARK_CLUSTER_ROLE: &str = "spark-k8s-clusterrole";
 
+pub const METRICS_PORT: u16 = 18081;
 pub const HISTORY_UI_PORT: u16 = 18080;
 
 pub const LISTENER_VOLUME_NAME: &str = "listener";
diff --git a/rust/operator-binary/src/history/config/jvm.rs b/rust/operator-binary/src/history/config/jvm.rs
index dd23cf21..4022a3ad 100644
--- a/rust/operator-binary/src/history/config/jvm.rs
+++ b/rust/operator-binary/src/history/config/jvm.rs
@@ -5,8 +5,9 @@ use stackable_operator::role_utils::{
 
 use crate::crd::{
     constants::{
-        JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, STACKABLE_TLS_STORE_PASSWORD,
-        STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG_CONFIG,
+        JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, METRICS_PORT,
+        STACKABLE_TLS_STORE_PASSWORD, STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG,
+        VOLUME_MOUNT_PATH_LOG_CONFIG,
     },
     history::HistoryConfigFragment,
     logdir::ResolvedLogDir,
@@ -32,6 +33,9 @@ pub fn construct_history_jvm_args(
         format!(
             "-Djava.security.properties={VOLUME_MOUNT_PATH_CONFIG}/{JVM_SECURITY_PROPERTIES_FILE}"
         ),
+        format!(
+            "-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar={METRICS_PORT}:/stackable/jmx/config.yaml"
+        ),
     ];
 
     if logdir.tls_enabled() {
@@ -82,7 +86,8 @@ mod tests {
 
         assert_eq!(
             jvm_config,
             "-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \
-             -Djava.security.properties=/stackable/spark/conf/security.properties"
+             -Djava.security.properties=/stackable/spark/conf/security.properties \
+             -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml"
         );
     }
@@ -125,6 +130,7 @@ mod tests {
             jvm_config,
             "-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \
              -Djava.security.properties=/stackable/spark/conf/security.properties \
+             -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml \
              -Dhttps.proxyHost=proxy.my.corp \
              -Djava.net.preferIPv4Stack=true \
              -Dhttps.proxyPort=1234"
diff --git a/rust/operator-binary/src/history/history_controller.rs b/rust/operator-binary/src/history/history_controller.rs
index c07f90f7..55b17c5d 100644
--- a/rust/operator-binary/src/history/history_controller.rs
+++ b/rust/operator-binary/src/history/history_controller.rs
@@ -56,11 +56,11 @@ use crate::{
         constants::{
             ACCESS_KEY_ID, HISTORY_APP_NAME, HISTORY_CONTROLLER_NAME, HISTORY_ROLE_NAME,
             HISTORY_UI_PORT, JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR,
-            LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, OPERATOR_NAME, SECRET_ACCESS_KEY,
-            SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME, SPARK_IMAGE_BASE_NAME,
-            STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG, VOLUME_MOUNT_NAME_LOG,
-            VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG,
-            VOLUME_MOUNT_PATH_LOG_CONFIG,
+            LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, METRICS_PORT, OPERATOR_NAME,
+            SECRET_ACCESS_KEY, SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME,
+            SPARK_IMAGE_BASE_NAME, STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG,
+            VOLUME_MOUNT_NAME_LOG, VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG,
+            VOLUME_MOUNT_PATH_LOG, VOLUME_MOUNT_PATH_LOG_CONFIG,
         },
         history::{self, HistoryConfig, SparkHistoryServerContainer, v1alpha1},
         listener_ext,
@@ -574,6 +574,7 @@ fn build_stateful_set(
         ])
         .args(command_args(log_dir))
         .add_container_port("http", HISTORY_UI_PORT.into())
+        .add_container_port("metrics", METRICS_PORT.into())
         .add_env_vars(merged_env)
         .add_volume_mounts(log_dir.volume_mounts())
         .context(AddVolumeMountSnafu)?
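Reviewer note: below is a minimal sketch, not part of the patch, for manually checking the endpoint behind the new `metrics` container port. It assumes the history server pod has been port-forwarded to `localhost:18081` (for example via `kubectl port-forward`) and that the `reqwest` crate is available with its `blocking` feature; the URL path and the crate choice are illustrative assumptions, not something this PR adds.

```rust
// Minimal sketch (not part of this patch): scrape the JMX exporter endpoint behind
// the new `metrics` container port (METRICS_PORT = 18081) and check that it serves
// Prometheus text-format metrics.
//
// Assumptions (illustrative, not from this PR): the history server pod has been
// port-forwarded to localhost:18081, and the `reqwest` crate is used with its
// `blocking` feature enabled.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The jmx_prometheus_javaagent serves metrics on the port configured in the
    // -javaagent flag added by this PR.
    let body = reqwest::blocking::get("http://localhost:18081/metrics")?.text()?;

    // The Prometheus text format introduces metric families with `# HELP` / `# TYPE` lines.
    assert!(body.lines().any(|line| line.starts_with("# HELP")));
    println!("scraped {} metric lines", body.lines().count());
    Ok(())
}
```

Running this as a scratch binary while the history server is up should print a non-zero line count; an empty response would suggest the javaagent flag or the container port is not wired up.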