Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ All notable changes to this project will be documented in this file.
- The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator
- The defaults from the docker images itself will now apply, which will be different from 1000/0 going forward
- This is marked as breaking because tools and policies might exist, which require these fields to be set
- BREAKING: the JMX exporter has been replaced with the built-in Prometheus servlet. The history pods do not expose metrics anymore ([#584])

### Fixed

Expand All @@ -59,6 +60,7 @@ All notable changes to this project will be documented in this file.
[#574]: https://github.com/stackabletech/spark-k8s-operator/pull/574
[#580]: https://github.com/stackabletech/spark-k8s-operator/pull/580
[#575]: https://github.com/stackabletech/spark-k8s-operator/pull/575
[#584]: https://github.com/stackabletech/spark-k8s-operator/pull/584

## [25.3.0] - 2025-03-21

Expand Down
8 changes: 8 additions & 0 deletions docs/modules/spark-k8s/pages/usage-guide/history-server.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,11 @@ spark-history-node-cleaner NodePort 10.96.203.43 <none> 18080:325
By setting up port forwarding on 18080, the UI can be opened by pointing your browser to `http://localhost:18080`:

image::history-server-ui.png[History Server Console]

== Metrics

[NOTE]
====
Up to version 25.3 of the Stackable Data Platform, the history server used the JMX exporter to expose metrics on a separate port.
Starting with version 25.7, the JMX exporter has been removed, and as of Spark version 3.5.6 the history server does not expose metrics anymore.
====
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,18 @@ As the operator creates the necessary resources, the status of the application t

NOTE: The operator never reconciles an application once it has been created.
To resubmit an application, a new SparkApplication resource must be created.

== Metrics

[NOTE]
====
Up to version 25.3 of the Stackable Data Platform, Spark applications used the JMX exporter to expose metrics on a separate port.
Starting with version 25.7, the built-in Prometheus servlet is used instead.
====

Application driver pods expose Prometheus metrics at the following endpoints:

* `/metrics/prometheus` for driver instances
* `/metrics/executors/prometheus` for executor instances

These endpoints are available on the same port as the Spark UI, which is 4040 by default.
7 changes: 6 additions & 1 deletion rust/operator-binary/src/connect/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ pub enum Error {
JvmSecurityProperties {
source: product_config::writer::PropertiesWriterError,
},

#[snafu(display("failed to serialize metrics properties",))]
MetricsProperties {
source: product_config::writer::PropertiesWriterError,
},
}

pub(crate) fn labels<'a, T>(
Expand Down Expand Up @@ -149,5 +154,5 @@ pub(crate) fn metrics_properties(
);
}

to_java_properties_string(result.iter()).context(JvmSecurityPropertiesSnafu)
to_java_properties_string(result.iter()).context(MetricsPropertiesSnafu)
}
1 change: 0 additions & 1 deletion rust/operator-binary/src/crd/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ pub const SPARK_DEFAULTS_FILE_NAME: &str = "spark-defaults.conf";
pub const SPARK_ENV_SH_FILE_NAME: &str = "spark-env.sh";

pub const SPARK_CLUSTER_ROLE: &str = "spark-k8s-clusterrole";
pub const METRICS_PORT: u16 = 18081;
pub const HISTORY_UI_PORT: u16 = 18080;

pub const LISTENER_VOLUME_NAME: &str = "listener";
Expand Down
8 changes: 8 additions & 0 deletions rust/operator-binary/src/crd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,14 @@ impl v1alpha1::SparkApplication {
)]);
}

// Enable Prometheus metrics export
submit_cmd.extend(vec![
"--conf spark.metrics.conf.\\*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet".to_string(),
"--conf spark.metrics.conf.\\*.sink.prometheusServlet.path=/metrics/prometheus".to_string(),
"--conf spark.ui.prometheus.enabled=true".to_string(),
"--conf spark.sql.streaming.metricsEnabled=true".to_string(),
]);

// some command elements need to be initially stored in a map (to allow overwrites) and
// then added to the vector once complete.
let mut submit_conf: BTreeMap<String, String> = BTreeMap::new();
Expand Down
12 changes: 3 additions & 9 deletions rust/operator-binary/src/history/config/jvm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ use stackable_operator::role_utils::{

use crate::crd::{
constants::{
JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, METRICS_PORT,
STACKABLE_TLS_STORE_PASSWORD, STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG,
VOLUME_MOUNT_PATH_LOG_CONFIG,
JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, STACKABLE_TLS_STORE_PASSWORD,
STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG_CONFIG,
},
history::HistoryConfigFragment,
logdir::ResolvedLogDir,
Expand All @@ -33,9 +32,6 @@ pub fn construct_history_jvm_args(
format!(
"-Djava.security.properties={VOLUME_MOUNT_PATH_CONFIG}/{JVM_SECURITY_PROPERTIES_FILE}"
),
format!(
"-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar={METRICS_PORT}:/stackable/jmx/config.yaml"
),
];

if logdir.tls_enabled() {
Expand Down Expand Up @@ -86,8 +82,7 @@ mod tests {
assert_eq!(
jvm_config,
"-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \
-Djava.security.properties=/stackable/spark/conf/security.properties \
-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml"
-Djava.security.properties=/stackable/spark/conf/security.properties"
);
}

Expand Down Expand Up @@ -130,7 +125,6 @@ mod tests {
jvm_config,
"-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \
-Djava.security.properties=/stackable/spark/conf/security.properties \
-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml \
-Dhttps.proxyHost=proxy.my.corp \
-Djava.net.preferIPv4Stack=true \
-Dhttps.proxyPort=1234"
Expand Down
11 changes: 5 additions & 6 deletions rust/operator-binary/src/history/history_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ use crate::{
constants::{
ACCESS_KEY_ID, HISTORY_APP_NAME, HISTORY_CONTROLLER_NAME, HISTORY_ROLE_NAME,
HISTORY_UI_PORT, JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR,
LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, METRICS_PORT, OPERATOR_NAME,
SECRET_ACCESS_KEY, SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME,
SPARK_IMAGE_BASE_NAME, STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG,
VOLUME_MOUNT_NAME_LOG, VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG,
VOLUME_MOUNT_PATH_LOG, VOLUME_MOUNT_PATH_LOG_CONFIG,
LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, OPERATOR_NAME, SECRET_ACCESS_KEY,
SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME, SPARK_IMAGE_BASE_NAME,
STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG, VOLUME_MOUNT_NAME_LOG,
VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG,
VOLUME_MOUNT_PATH_LOG_CONFIG,
},
history::{self, HistoryConfig, SparkHistoryServerContainer, v1alpha1},
listener_ext,
Expand Down Expand Up @@ -574,7 +574,6 @@ fn build_stateful_set(
])
.args(command_args(log_dir))
.add_container_port("http", HISTORY_UI_PORT.into())
.add_container_port("metrics", METRICS_PORT.into())
.add_env_vars(merged_env)
.add_volume_mounts(log_dir.volume_mounts())
.context(AddVolumeMountSnafu)?
Expand Down
46 changes: 26 additions & 20 deletions rust/operator-binary/src/spark_k8s_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use stackable_operator::{
core::{DeserializeGuard, error_boundary},
runtime::{controller::Action, reflector::ObjectRef},
},
kvp::Label,
logging::controller::ReconcilerError,
product_config_utils::ValidatedRoleConfigByPropertyKind,
product_logging::{
Expand Down Expand Up @@ -610,27 +611,32 @@ fn pod_template(
);
}

let mut omb = ObjectMetaBuilder::new();
omb.name(&container_name)
// this reference is not pointing to a controller but only provides a UID that can used to clean up resources
// cleanly (specifically driver pods and related config maps) when the spark application is deleted.
.ownerreference_from_resource(spark_application, None, None)
.context(ObjectMissingMetadataForOwnerRefSnafu)?
.with_recommended_labels(
spark_application
.build_recommended_labels(&spark_image.app_version_label, &container_name),
)
.context(MetadataBuildSnafu)?;

// Only the driver pod should be scraped by Prometheus
// because the executor metrics are also available via /metrics/executors/prometheus/
if role == SparkApplicationRole::Driver {
omb.with_label(Label::try_from(("prometheus.io/scrape", "true")).context(LabelBuildSnafu)?);
}

let mut pb = PodBuilder::new();
pb.metadata(
ObjectMetaBuilder::new()
.name(&container_name)
// this reference is not pointing to a controller but only provides a UID that can used to clean up resources
// cleanly (specifically driver pods and related config maps) when the spark application is deleted.
.ownerreference_from_resource(spark_application, None, None)
.context(ObjectMissingMetadataForOwnerRefSnafu)?
.with_recommended_labels(
spark_application
.build_recommended_labels(&spark_image.app_version_label, &container_name),
)
.context(MetadataBuildSnafu)?
.build(),
)
.add_container(cb.build())
.add_volumes(volumes.to_vec())
.context(AddVolumeSnafu)?
.security_context(security_context())
.image_pull_secrets_from_product_image(spark_image)
.affinity(&config.affinity);
pb.metadata(omb.build())
.add_container(cb.build())
.add_volumes(volumes.to_vec())
.context(AddVolumeSnafu)?
.security_context(security_context())
.image_pull_secrets_from_product_image(spark_image)
.affinity(&config.affinity);

let init_containers = init_containers(
spark_application,
Expand Down
14 changes: 14 additions & 0 deletions tests/templates/kuttl/spark-connect/30-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 300
commands:
- script: |
# This endpoint (/metrics/prometheus) is also used as a liveness probe
echo test Prometheus endpoint for driver metrics
DRIVER_METRIC_COUNT=$(kubectl exec spark-connect-server-0 -c spark -n $NAMESPACE -- curl localhost:4040/metrics/prometheus | grep _driver_ | wc -l)
test 0 -lt "$DRIVER_METRIC_COUNT"

echo test Prometheus endpoint for executor metrics
EXECUTOR_METRIC_COUNT=$(kubectl exec spark-connect-server-0 -c spark -n $NAMESPACE -- curl localhost:4040/metrics/executors/prometheus | grep _executor_ | wc -l)
test 0 -lt "$EXECUTOR_METRIC_COUNT"