Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ All notable changes to this project will be documented in this file.
- The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator
- The defaults from the docker images itself will now apply, which will be different from 1000/0 going forward
- This is marked as breaking because tools and policies might exist, which require these fields to be set
- BREAKING: the JMX exporter has been replaced with the built-in Prometheus servlet. The history server pods do not expose metrics anymore ([#584])

### Fixed

Expand All @@ -59,6 +60,7 @@ All notable changes to this project will be documented in this file.
[#574]: https://github.com/stackabletech/spark-k8s-operator/pull/574
[#580]: https://github.com/stackabletech/spark-k8s-operator/pull/580
[#575]: https://github.com/stackabletech/spark-k8s-operator/pull/575
[#584]: https://github.com/stackabletech/spark-k8s-operator/pull/584

## [25.3.0] - 2025-03-21

Expand Down
8 changes: 8 additions & 0 deletions docs/modules/spark-k8s/pages/usage-guide/history-server.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,11 @@ spark-history-node-cleaner NodePort 10.96.203.43 <none> 18080:325
By setting up port forwarding on 18080 the UI can be opened by pointing your browser to `http://localhost:18080`:

image::history-server-ui.png[History Server Console]

== Metrics

[NOTE]
====
Up to version 25.3 of the Stackable Data Platform, the history server used the JMX exporter to expose metrics on a separate port.
Starting with version 25.7, the JMX exporter has been removed and, as of Spark version 3.5.6, the history server does not expose metrics anymore.
====
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,18 @@ As the operator creates the necessary resources, the status of the application t

NOTE: The operator never reconciles an application once it has been created.
To resubmit an application, a new SparkApplication resource must be created.

== Metrics

[NOTE]
====
Up to version 25.3 of the Stackable Data Platform, Spark applications used the JMX exporter to expose metrics on a separate port.
Starting with version 25.7, the built-in Prometheus servlet is used instead.
====

Application driver pods expose Prometheus metrics at the following endpoints:

* `/metrics/prometheus` for driver instances
* `/metrics/executors/prometheus` for executor instances

These endpoints are available on the same port as the Spark UI, which is 4040 by default.
7 changes: 6 additions & 1 deletion rust/operator-binary/src/connect/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ pub enum Error {
JvmSecurityProperties {
source: product_config::writer::PropertiesWriterError,
},

#[snafu(display("failed to serialize metrics properties",))]
MetricsProperties {
source: product_config::writer::PropertiesWriterError,
},
}

pub(crate) fn labels<'a, T>(
Expand Down Expand Up @@ -149,5 +154,5 @@ pub(crate) fn metrics_properties(
);
}

to_java_properties_string(result.iter()).context(JvmSecurityPropertiesSnafu)
to_java_properties_string(result.iter()).context(MetricsPropertiesSnafu)
}
1 change: 0 additions & 1 deletion rust/operator-binary/src/crd/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ pub const SPARK_DEFAULTS_FILE_NAME: &str = "spark-defaults.conf";
pub const SPARK_ENV_SH_FILE_NAME: &str = "spark-env.sh";

pub const SPARK_CLUSTER_ROLE: &str = "spark-k8s-clusterrole";
pub const METRICS_PORT: u16 = 18081;
pub const HISTORY_UI_PORT: u16 = 18080;

pub const LISTENER_VOLUME_NAME: &str = "listener";
Expand Down
8 changes: 8 additions & 0 deletions rust/operator-binary/src/crd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,14 @@ impl v1alpha1::SparkApplication {
)]);
}

// Enable Prometheus metrics export
submit_cmd.extend(vec![
"--conf spark.metrics.conf.\\*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet".to_string(),
"--conf spark.metrics.conf.\\*.sink.prometheusServlet.path=/metrics/prometheus".to_string(),
"--conf spark.ui.prometheus.enabled=true".to_string(),
"--conf spark.sql.streaming.metricsEnabled=true".to_string(),
]);

// some command elements need to be initially stored in a map (to allow overwrites) and
// then added to the vector once complete.
let mut submit_conf: BTreeMap<String, String> = BTreeMap::new();
Expand Down
12 changes: 3 additions & 9 deletions rust/operator-binary/src/history/config/jvm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ use stackable_operator::role_utils::{

use crate::crd::{
constants::{
JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, METRICS_PORT,
STACKABLE_TLS_STORE_PASSWORD, STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG,
VOLUME_MOUNT_PATH_LOG_CONFIG,
JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, STACKABLE_TLS_STORE_PASSWORD,
STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG_CONFIG,
},
history::HistoryConfigFragment,
logdir::ResolvedLogDir,
Expand All @@ -33,9 +32,6 @@ pub fn construct_history_jvm_args(
format!(
"-Djava.security.properties={VOLUME_MOUNT_PATH_CONFIG}/{JVM_SECURITY_PROPERTIES_FILE}"
),
format!(
"-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar={METRICS_PORT}:/stackable/jmx/config.yaml"
),
];

if logdir.tls_enabled() {
Expand Down Expand Up @@ -86,8 +82,7 @@ mod tests {
assert_eq!(
jvm_config,
"-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \
-Djava.security.properties=/stackable/spark/conf/security.properties \
-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml"
-Djava.security.properties=/stackable/spark/conf/security.properties"
);
}

Expand Down Expand Up @@ -130,7 +125,6 @@ mod tests {
jvm_config,
"-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \
-Djava.security.properties=/stackable/spark/conf/security.properties \
-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml \
-Dhttps.proxyHost=proxy.my.corp \
-Djava.net.preferIPv4Stack=true \
-Dhttps.proxyPort=1234"
Expand Down
11 changes: 5 additions & 6 deletions rust/operator-binary/src/history/history_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ use crate::{
constants::{
ACCESS_KEY_ID, HISTORY_APP_NAME, HISTORY_CONTROLLER_NAME, HISTORY_ROLE_NAME,
HISTORY_UI_PORT, JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR,
LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, METRICS_PORT, OPERATOR_NAME,
SECRET_ACCESS_KEY, SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME,
SPARK_IMAGE_BASE_NAME, STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG,
VOLUME_MOUNT_NAME_LOG, VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG,
VOLUME_MOUNT_PATH_LOG, VOLUME_MOUNT_PATH_LOG_CONFIG,
LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, OPERATOR_NAME, SECRET_ACCESS_KEY,
SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME, SPARK_IMAGE_BASE_NAME,
STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG, VOLUME_MOUNT_NAME_LOG,
VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG,
VOLUME_MOUNT_PATH_LOG_CONFIG,
},
history::{self, HistoryConfig, SparkHistoryServerContainer, v1alpha1},
listener_ext,
Expand Down Expand Up @@ -574,7 +574,6 @@ fn build_stateful_set(
])
.args(command_args(log_dir))
.add_container_port("http", HISTORY_UI_PORT.into())
.add_container_port("metrics", METRICS_PORT.into())
.add_env_vars(merged_env)
.add_volume_mounts(log_dir.volume_mounts())
.context(AddVolumeMountSnafu)?
Expand Down
46 changes: 26 additions & 20 deletions rust/operator-binary/src/spark_k8s_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use stackable_operator::{
core::{DeserializeGuard, error_boundary},
runtime::{controller::Action, reflector::ObjectRef},
},
kvp::Label,
logging::controller::ReconcilerError,
product_config_utils::ValidatedRoleConfigByPropertyKind,
product_logging::{
Expand Down Expand Up @@ -610,27 +611,32 @@ fn pod_template(
);
}

let mut omb = ObjectMetaBuilder::new();
omb.name(&container_name)
// this reference is not pointing to a controller but only provides a UID that can used to clean up resources
// cleanly (specifically driver pods and related config maps) when the spark application is deleted.
.ownerreference_from_resource(spark_application, None, None)
.context(ObjectMissingMetadataForOwnerRefSnafu)?
.with_recommended_labels(
spark_application
.build_recommended_labels(&spark_image.app_version_label, &container_name),
)
.context(MetadataBuildSnafu)?;

// Only the driver pod should be scraped by Prometheus
// because the executor metrics are also available via /metrics/executors/prometheus/
if role == SparkApplicationRole::Driver {
omb.with_label(Label::try_from(("prometheus.io/scrape", "true")).context(LabelBuildSnafu)?);
}

let mut pb = PodBuilder::new();
pb.metadata(
ObjectMetaBuilder::new()
.name(&container_name)
// this reference is not pointing to a controller but only provides a UID that can used to clean up resources
// cleanly (specifically driver pods and related config maps) when the spark application is deleted.
.ownerreference_from_resource(spark_application, None, None)
.context(ObjectMissingMetadataForOwnerRefSnafu)?
.with_recommended_labels(
spark_application
.build_recommended_labels(&spark_image.app_version_label, &container_name),
)
.context(MetadataBuildSnafu)?
.build(),
)
.add_container(cb.build())
.add_volumes(volumes.to_vec())
.context(AddVolumeSnafu)?
.security_context(security_context())
.image_pull_secrets_from_product_image(spark_image)
.affinity(&config.affinity);
pb.metadata(omb.build())
.add_container(cb.build())
.add_volumes(volumes.to_vec())
.context(AddVolumeSnafu)?
.security_context(security_context())
.image_pull_secrets_from_product_image(spark_image)
.affinity(&config.affinity);

let init_containers = init_containers(
spark_application,
Expand Down
14 changes: 14 additions & 0 deletions tests/templates/kuttl/spark-connect/30-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 300
commands:
- script: |
      # This endpoint (/metrics/prometheus) is also used as liveness probe
echo test Prometheus endpoint for driver metrics
DRIVER_METRIC_COUNT=$(kubectl exec spark-connect-server-0 -c spark -n $NAMESPACE -- curl localhost:4040/metrics/prometheus | grep _driver_ | wc -l)
test 0 -lt "$DRIVER_METRIC_COUNT"

echo test Prometheus endpoint for executor metrics
EXECUTOR_METRIC_COUNT=$(kubectl exec spark-connect-server-0 -c spark -n $NAMESPACE -- curl localhost:4040/metrics/executors/prometheus | grep _executor_ | wc -l)
test 0 -lt "$EXECUTOR_METRIC_COUNT"