From dae8014d38f8b63580bd07d877ef1e11f66f4416 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:04:10 +0200 Subject: [PATCH 1/7] remove references to the jmx exporter agent --- rust/operator-binary/src/crd/constants.rs | 1 - rust/operator-binary/src/history/config/jvm.rs | 12 +++--------- .../src/history/history_controller.rs | 11 +++++------ 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs index 7e32e392..25947792 100644 --- a/rust/operator-binary/src/crd/constants.rs +++ b/rust/operator-binary/src/crd/constants.rs @@ -86,7 +86,6 @@ pub const SPARK_DEFAULTS_FILE_NAME: &str = "spark-defaults.conf"; pub const SPARK_ENV_SH_FILE_NAME: &str = "spark-env.sh"; pub const SPARK_CLUSTER_ROLE: &str = "spark-k8s-clusterrole"; -pub const METRICS_PORT: u16 = 18081; pub const HISTORY_UI_PORT: u16 = 18080; pub const LISTENER_VOLUME_NAME: &str = "listener"; diff --git a/rust/operator-binary/src/history/config/jvm.rs b/rust/operator-binary/src/history/config/jvm.rs index 4022a3ad..dd23cf21 100644 --- a/rust/operator-binary/src/history/config/jvm.rs +++ b/rust/operator-binary/src/history/config/jvm.rs @@ -5,9 +5,8 @@ use stackable_operator::role_utils::{ use crate::crd::{ constants::{ - JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, METRICS_PORT, - STACKABLE_TLS_STORE_PASSWORD, STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG, - VOLUME_MOUNT_PATH_LOG_CONFIG, + JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, STACKABLE_TLS_STORE_PASSWORD, + STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG_CONFIG, }, history::HistoryConfigFragment, logdir::ResolvedLogDir, @@ -33,9 +32,6 @@ pub fn construct_history_jvm_args( format!( "-Djava.security.properties={VOLUME_MOUNT_PATH_CONFIG}/{JVM_SECURITY_PROPERTIES_FILE}" ), - format!( - 
"-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar={METRICS_PORT}:/stackable/jmx/config.yaml" - ), ]; if logdir.tls_enabled() { @@ -86,8 +82,7 @@ mod tests { assert_eq!( jvm_config, "-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \ - -Djava.security.properties=/stackable/spark/conf/security.properties \ - -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml" + -Djava.security.properties=/stackable/spark/conf/security.properties" ); } @@ -130,7 +125,6 @@ mod tests { jvm_config, "-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \ -Djava.security.properties=/stackable/spark/conf/security.properties \ - -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml \ -Dhttps.proxyHost=proxy.my.corp \ -Djava.net.preferIPv4Stack=true \ -Dhttps.proxyPort=1234" diff --git a/rust/operator-binary/src/history/history_controller.rs b/rust/operator-binary/src/history/history_controller.rs index 55b17c5d..c07f90f7 100644 --- a/rust/operator-binary/src/history/history_controller.rs +++ b/rust/operator-binary/src/history/history_controller.rs @@ -56,11 +56,11 @@ use crate::{ constants::{ ACCESS_KEY_ID, HISTORY_APP_NAME, HISTORY_CONTROLLER_NAME, HISTORY_ROLE_NAME, HISTORY_UI_PORT, JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR, - LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, METRICS_PORT, OPERATOR_NAME, - SECRET_ACCESS_KEY, SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME, - SPARK_IMAGE_BASE_NAME, STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG, - VOLUME_MOUNT_NAME_LOG, VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG, - VOLUME_MOUNT_PATH_LOG, VOLUME_MOUNT_PATH_LOG_CONFIG, + LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, OPERATOR_NAME, SECRET_ACCESS_KEY, + SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME, SPARK_IMAGE_BASE_NAME, + STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG, VOLUME_MOUNT_NAME_LOG, + VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG, 
VOLUME_MOUNT_PATH_LOG, + VOLUME_MOUNT_PATH_LOG_CONFIG, }, history::{self, HistoryConfig, SparkHistoryServerContainer, v1alpha1}, listener_ext, @@ -574,7 +574,6 @@ fn build_stateful_set( ]) .args(command_args(log_dir)) .add_container_port("http", HISTORY_UI_PORT.into()) - .add_container_port("metrics", METRICS_PORT.into()) .add_env_vars(merged_env) .add_volume_mounts(log_dir.volume_mounts()) .context(AddVolumeMountSnafu)? From 38a60f647924723befb9b80a20acda768d48deb9 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:01:04 +0200 Subject: [PATCH 2/7] add test for connect Prometheus metrics endpoint --- tests/templates/kuttl/spark-connect/30-assert.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 tests/templates/kuttl/spark-connect/30-assert.yaml diff --git a/tests/templates/kuttl/spark-connect/30-assert.yaml b/tests/templates/kuttl/spark-connect/30-assert.yaml new file mode 100644 index 00000000..f77806e6 --- /dev/null +++ b/tests/templates/kuttl/spark-connect/30-assert.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 300 +commands: + - script: | + # This endpoint (/metrics/prometheus) is also used as liveliness probe + echo test Prometheus endpoint for driver metrics + DRIVER_METRIC_COUNT=$(kubectl exec spark-connect-server-0 -c spark -n $NAMESPACE -- curl localhost:4040/metrics/prometheus | grep _driver_ | wc -l) + test 0 -lt "$DRIVER_METRIC_COUNT" + + echo test Prometheus endpoint for executor metrics + EXECUTOR_METRIC_COUNT=$(kubectl exec spark-connect-server-0 -c spark -n $NAMESPACE -- curl localhost:4040/metrics/executors/prometheus | grep _executor_ | wc -l) + test 0 -lt "$EXECUTOR_METRIC_COUNT" From 2bb8dff1bf9dd811cfbbc7f49a551f1aa829e349 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:59:08 +0200 Subject: [PATCH 3/7] use different error variant for metric 
serialization --- rust/operator-binary/src/connect/common.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rust/operator-binary/src/connect/common.rs b/rust/operator-binary/src/connect/common.rs index ef6f3608..c38836c6 100644 --- a/rust/operator-binary/src/connect/common.rs +++ b/rust/operator-binary/src/connect/common.rs @@ -34,6 +34,11 @@ pub enum Error { JvmSecurityProperties { source: product_config::writer::PropertiesWriterError, }, + + #[snafu(display("failed to serialize metrics properties",))] + MetricsProperties { + source: product_config::writer::PropertiesWriterError, + }, } pub(crate) fn labels<'a, T>( @@ -149,5 +154,5 @@ pub(crate) fn metrics_properties( ); } - to_java_properties_string(result.iter()).context(JvmSecurityPropertiesSnafu) + to_java_properties_string(result.iter()).context(MetricsPropertiesSnafu) } From 2ab2d4f313daaa9dd68451feb7d713d16957efed Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Thu, 12 Jun 2025 14:11:02 +0200 Subject: [PATCH 4/7] applications export Prometheus metrics --- rust/operator-binary/src/crd/mod.rs | 8 ++++ .../src/spark_k8s_controller.rs | 46 +++++++++++-------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 629d20f4..9698fb48 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -649,6 +649,14 @@ impl v1alpha1::SparkApplication { )]); } + // Enable Prometheus metrics export + submit_cmd.extend(vec![ + "--conf spark.metrics.conf.\\*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet".to_string(), + "--conf spark.metrics.conf.\\*.sink.prometheusServlet.path=/metrics/prometheus".to_string(), + "--conf spark.ui.prometheus.enabled=true".to_string(), + "--conf spark.sql.streaming.metricsEnabled=true".to_string(), + ]); + // some command elements need to be initially stored in a map (to 
allow overwrites) and // then added to the vector once complete. let mut submit_conf: BTreeMap = BTreeMap::new(); diff --git a/rust/operator-binary/src/spark_k8s_controller.rs b/rust/operator-binary/src/spark_k8s_controller.rs index 7fcbf36d..e13cc57b 100644 --- a/rust/operator-binary/src/spark_k8s_controller.rs +++ b/rust/operator-binary/src/spark_k8s_controller.rs @@ -37,6 +37,7 @@ use stackable_operator::{ core::{DeserializeGuard, error_boundary}, runtime::{controller::Action, reflector::ObjectRef}, }, + kvp::Label, logging::controller::ReconcilerError, product_config_utils::ValidatedRoleConfigByPropertyKind, product_logging::{ @@ -610,27 +611,32 @@ fn pod_template( ); } + let mut omb = ObjectMetaBuilder::new(); + omb.name(&container_name) + // this reference is not pointing to a controller but only provides a UID that can used to clean up resources + // cleanly (specifically driver pods and related config maps) when the spark application is deleted. + .ownerreference_from_resource(spark_application, None, None) + .context(ObjectMissingMetadataForOwnerRefSnafu)? + .with_recommended_labels( + spark_application + .build_recommended_labels(&spark_image.app_version_label, &container_name), + ) + .context(MetadataBuildSnafu)?; + + // Only the driver pod should be scraped by Prometheus + // because the executor metrics are also available via /metrics/executors/prometheus/ + if role == SparkApplicationRole::Driver { + omb.with_label(Label::try_from(("prometheus.io/scrape", "true")).context(LabelBuildSnafu)?); + } + let mut pb = PodBuilder::new(); - pb.metadata( - ObjectMetaBuilder::new() - .name(&container_name) - // this reference is not pointing to a controller but only provides a UID that can used to clean up resources - // cleanly (specifically driver pods and related config maps) when the spark application is deleted. - .ownerreference_from_resource(spark_application, None, None) - .context(ObjectMissingMetadataForOwnerRefSnafu)? 
- .with_recommended_labels( - spark_application - .build_recommended_labels(&spark_image.app_version_label, &container_name), - ) - .context(MetadataBuildSnafu)? - .build(), - ) - .add_container(cb.build()) - .add_volumes(volumes.to_vec()) - .context(AddVolumeSnafu)? - .security_context(security_context()) - .image_pull_secrets_from_product_image(spark_image) - .affinity(&config.affinity); + pb.metadata(omb.build()) + .add_container(cb.build()) + .add_volumes(volumes.to_vec()) + .context(AddVolumeSnafu)? + .security_context(security_context()) + .image_pull_secrets_from_product_image(spark_image) + .affinity(&config.affinity); let init_containers = init_containers( spark_application, From ad8fa189c3f90c65030e0f5e6019fecc102bf4d6 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Thu, 12 Jun 2025 14:47:39 +0200 Subject: [PATCH 5/7] update docs --- .../pages/usage-guide/history-server.adoc | 8 ++++++++ .../usage-guide/operations/applications.adoc | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc b/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc index 52a51a4f..276aafc6 100644 --- a/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc @@ -156,3 +156,11 @@ spark-history-node-cleaner NodePort 10.96.203.43 18080:325 By setting up port forwarding on 18080 the UI can be opened by pointing your browser to `http://localhost:18080`: image::history-server-ui.png[History Server Console] + +== Metrics + +[NOTE] +==== +Up to version 25.3 of the Stackable Data Platform, the history server used the JMX exporter to expose metrics on a separate port. +Starting with version 25.7 the JMX exporter has been removed and the history server doesn't expose metrics as of Spark version 3.5.6. 
+==== diff --git a/docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc b/docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc index cc85b738..08367f0f 100644 --- a/docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc @@ -8,3 +8,18 @@ As the operator creates the necessary resources, the status of the application t NOTE: The operator never reconciles an application once it has been created. To resubmit an application, a new SparkApplication resource must be created. + +== Metrics + +[NOTE] +==== +Up to version 25.3 of the Stackable Data Platform, Spark applications used the JMX exporter to expose metrics on a separate port. +Starting with version 25.7, the built-in Prometheus servlet is used instead. +==== + +Application driver pods expose Prometheus metrics at the following endpoints: + +* `/metrics/prometheus` for driver instances +* `/metrics/executors/prometheus` for executor instances. + +These endpoints are available on the same port as the Spark UI, which is 4040 by default. From d0d1f47cf287264af288693412bc0aa1d532f042 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Thu, 12 Jun 2025 14:51:41 +0200 Subject: [PATCH 6/7] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b923daa..28d43f37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ All notable changes to this project will be documented in this file. - The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator - The defaults from the docker images itself will now apply, which will be different from 1000/0 going forward - This is marked as breaking because tools and policies might exist, which require these fields to be set +- BREAKING: the JMX exporter has been an replaced with the built-in Prometheus servlet. 
The history pods do not expose metrics anymore ([#584]) ### Fixed @@ -59,6 +60,7 @@ All notable changes to this project will be documented in this file. [#574]: https://github.com/stackabletech/spark-k8s-operator/pull/574 [#580]: https://github.com/stackabletech/spark-k8s-operator/pull/580 [#575]: https://github.com/stackabletech/spark-k8s-operator/pull/575 +[#584]: https://github.com/stackabletech/spark-k8s-operator/pull/584 ## [25.3.0] - 2025-03-21 From bb60f369331410513240f63426f36c474cb2010a Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:43:26 +0200 Subject: [PATCH 7/7] Update CHANGELOG.md Co-authored-by: Malte Sander --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28d43f37..bfc74d90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,7 @@ All notable changes to this project will be documented in this file. - The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator - The defaults from the docker images itself will now apply, which will be different from 1000/0 going forward - This is marked as breaking because tools and policies might exist, which require these fields to be set -- BREAKING: the JMX exporter has been an replaced with the built-in Prometheus servlet. The history pods do not expose metrics anymore ([#584]) +- BREAKING: the JMX exporter has been replaced with the built-in Prometheus servlet. The history server pods do not expose metrics anymore ([#584]) ### Fixed