
Commit 73ffb27

feat!: Add dedicated -metrics service (#748)
* feat!: Add dedicated -metrics service
* changelog
* improve doc comment
* linter
* linter
* Add container port
* fix port problem
* fix tests and role service name
* fix linter
* Update rust/operator-binary/src/controller.rs
  Co-authored-by: Malte Sander <[email protected]>
* changelog
* changelog
* changelog
* Update CHANGELOG.md
  Co-authored-by: Malte Sander <[email protected]>
* fix tests
* ruff ruff

---------

Co-authored-by: Malte Sander <[email protected]>
1 parent bc39656 commit 73ffb27

File tree

15 files changed: +193 -80 lines changed


CHANGELOG.md

Lines changed: 13 additions & 0 deletions
@@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Added
+
+- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#748]).
+- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#748]).
+
+### Changed
+
+- BREAKING: The per-rolegroup Services now only serve the HTTP port and have a `-headless` suffix to better indicate their
+  purpose and to be consistent with other operators ([#748]).
+- BREAKING: The per-role server Service now has a `-server` suffix to be consistent with other operators ([#748]).
+
+[#748]: https://github.com/stackabletech/opa-operator/pull/748
+
 ## [25.7.0] - 2025-07-23
 
 ## [25.7.0-rc1] - 2025-07-18
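
To make the renames concrete: for a hypothetical OpaCluster named simple-opa with role server and rolegroup default, the Service names before and after this commit would be roughly as follows (the exact -headless/-metrics names are inferred from the naming scheme described above, not quoted from the diff):

    # before                        # after
    simple-opa                      simple-opa-server                    (role-level)
    simple-opa-server-default       simple-opa-server-default-headless   (per-rolegroup, HTTP only)
                                    simple-opa-server-default-metrics    (per-rolegroup, Prometheus metrics)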

rust/operator-binary/src/controller.rs

Lines changed: 119 additions & 41 deletions
@@ -50,7 +50,7 @@ use stackable_operator::{
         core::{DeserializeGuard, error_boundary},
         runtime::{controller::Action, reflector::ObjectRef},
     },
-    kvp::{Label, LabelError, Labels, ObjectLabels},
+    kvp::{LabelError, Labels, ObjectLabels},
     logging::controller::ReconcilerError,
     memory::{BinaryMultiple, MemoryQuantity},
     product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config},
@@ -91,6 +91,7 @@ pub const BUNDLES_ACTIVE_DIR: &str = "/bundles/active";
 pub const BUNDLES_INCOMING_DIR: &str = "/bundles/incoming";
 pub const BUNDLES_TMP_DIR: &str = "/bundles/tmp";
 pub const BUNDLE_BUILDER_PORT: i32 = 3030;
+pub const OPA_STACKABLE_SERVICE_NAME: &str = "stackable";
 
 const CONFIG_VOLUME_NAME: &str = "config";
 const CONFIG_DIR: &str = "/stackable/config";
@@ -185,6 +186,12 @@
         rolegroup: RoleGroupRef<v1alpha1::OpaCluster>,
     },
 
+    #[snafu(display("failed to apply metrics Service for [{rolegroup}]"))]
+    ApplyRoleGroupMetricsService {
+        source: stackable_operator::cluster_resources::Error,
+        rolegroup: RoleGroupRef<v1alpha1::OpaCluster>,
+    },
+
     #[snafu(display("failed to build ConfigMap for [{rolegroup}]"))]
     BuildRoleGroupConfig {
         source: stackable_operator::builder::configmap::Error,
@@ -337,19 +344,20 @@ pub struct OpaClusterConfigFile {
     bundles: OpaClusterBundle,
     #[serde(skip_serializing_if = "Option::is_none")]
     decision_logs: Option<OpaClusterConfigDecisionLog>,
+    status: Option<OpaClusterConfigStatus>,
 }
 
 impl OpaClusterConfigFile {
     pub fn new(decision_logging: Option<OpaClusterConfigDecisionLog>) -> Self {
         Self {
             services: vec![OpaClusterConfigService {
-                name: String::from("stackable"),
-                url: String::from("http://localhost:3030/opa/v1"),
+                name: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                url: "http://localhost:3030/opa/v1".to_owned(),
             }],
             bundles: OpaClusterBundle {
                 stackable: OpaClusterBundleConfig {
-                    service: String::from("stackable"),
-                    resource: String::from("opa/bundle.tar.gz"),
+                    service: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                    resource: "opa/bundle.tar.gz".to_owned(),
                     persist: true,
                     polling: OpaClusterBundleConfigPolling {
                         min_delay_seconds: 10,
@@ -358,6 +366,12 @@
                 },
             },
             decision_logs: decision_logging,
+            // Enable more Prometheus metrics, such as bundle loads
+            // See https://www.openpolicyagent.org/docs/monitoring#status-metrics
+            status: Some(OpaClusterConfigStatus {
+                service: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                prometheus: true,
+            }),
         }
     }
 }
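
The status block above ends up in OPA's config file. A sketch of roughly what the rendered YAML now looks like (the max_delay_seconds value is cut off in this diff, so the polling block is abbreviated):

    services:
      - name: stackable
        url: http://localhost:3030/opa/v1
    bundles:
      stackable:
        service: stackable
        resource: opa/bundle.tar.gz
        persist: true
        polling:
          min_delay_seconds: 10
    status:
      service: stackable
      prometheus: true   # surfaces bundle-load and OPA-environment metrics on the Prometheus endpoint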
@@ -392,6 +406,12 @@ pub struct OpaClusterConfigDecisionLog {
     console: bool,
 }
 
+#[derive(Serialize, Deserialize)]
+struct OpaClusterConfigStatus {
+    service: String,
+    prometheus: bool,
+}
+
 pub async fn reconcile_opa(
     opa: Arc<DeserializeGuard<v1alpha1::OpaCluster>>,
     ctx: Arc<Ctx>,
@@ -489,7 +509,10 @@
             &rolegroup,
             &merged_config,
         )?;
-        let rg_service = build_rolegroup_service(opa, &resolved_product_image, &rolegroup)?;
+        let rg_service =
+            build_rolegroup_headless_service(opa, &resolved_product_image, &rolegroup)?;
+        let rg_metrics_service =
+            build_rolegroup_metrics_service(opa, &resolved_product_image, &rolegroup)?;
         let rg_daemonset = build_server_rolegroup_daemonset(
             opa,
             &resolved_product_image,
@@ -515,6 +538,12 @@
             .with_context(|_| ApplyRoleGroupServiceSnafu {
                 rolegroup: rolegroup.clone(),
             })?;
+        cluster_resources
+            .add(client, rg_metrics_service)
+            .await
+            .with_context(|_| ApplyRoleGroupMetricsServiceSnafu {
+                rolegroup: rolegroup.clone(),
+            })?;
         ds_cond_builder.add(
             cluster_resources
                 .add(client, rg_daemonset.clone())
@@ -611,12 +640,7 @@ pub fn build_server_role_service(
 
     let service_spec = ServiceSpec {
         type_: Some(opa.spec.cluster_config.listener_class.k8s_service_type()),
-        ports: Some(vec![ServicePort {
-            name: Some(APP_PORT_NAME.to_string()),
-            port: APP_PORT.into(),
-            protocol: Some("TCP".to_string()),
-            ..ServicePort::default()
-        }]),
+        ports: Some(data_service_ports()),
         selector: Some(service_selector_labels.into()),
         internal_traffic_policy: Some("Local".to_string()),
         ..ServiceSpec::default()
@@ -632,17 +656,14 @@
 /// The rolegroup [`Service`] is a headless service that allows direct access to the instances of a certain rolegroup
 ///
 /// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing.
-fn build_rolegroup_service(
+fn build_rolegroup_headless_service(
     opa: &v1alpha1::OpaCluster,
     resolved_product_image: &ResolvedProductImage,
     rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
 ) -> Result<Service> {
-    let prometheus_label =
-        Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?;
-
     let metadata = ObjectMetaBuilder::new()
         .name_and_namespace(opa)
-        .name(rolegroup.object_name())
+        .name(rolegroup.rolegroup_headless_service_name())
         .ownerreference_from_resource(opa, None, Some(true))
         .context(ObjectMissingMetadataForOwnerRefSnafu)?
         .with_recommended_labels(build_recommended_labels(
@@ -652,19 +673,20 @@
             &rolegroup.role_group,
         ))
         .context(ObjectMetaSnafu)?
-        .with_label(prometheus_label)
         .build();
 
-    let service_selector_labels =
-        Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group)
-            .context(BuildLabelSnafu)?;
-
     let service_spec = ServiceSpec {
-        // Internal communication does not need to be exposed
+        // Currently we don't offer listener-exposition of OPA, mostly due to security concerns:
+        // OPA is currently reachable from anywhere within the Kubernetes cluster (without
+        // authentication), and opening it up to the outside of Kubernetes might worsen things.
+        // We are open to implementing listener-integration, but this needs to be thought through
+        // before implementing it.
+        // Note: We have similar situations for HMS and ZooKeeper, as the authentication options
+        // there are non-existent (mTLS still opens a plain port) or poor (Kerberos).
         type_: Some("ClusterIP".to_string()),
         cluster_ip: Some("None".to_string()),
-        ports: Some(service_ports()),
-        selector: Some(service_selector_labels.into()),
+        ports: Some(data_service_ports()),
+        selector: Some(role_group_selector_labels(opa, rolegroup)?.into()),
         publish_not_ready_addresses: Some(true),
         ..ServiceSpec::default()
     };
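
For orientation, the headless Service built here would render roughly as follows for simple-opa / rolegroup default (the name and port name are assumptions based on this commit's naming scheme, with 8081 being OPA's HTTP port per the duplicate-port message further down):

    apiVersion: v1
    kind: Service
    metadata:
      name: simple-opa-server-default-headless
    spec:
      type: ClusterIP
      clusterIP: None
      publishNotReadyAddresses: true
      ports:
        - name: http        # assuming APP_PORT_NAME is "http"
          port: 8081
          protocol: TCP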
@@ -676,6 +698,55 @@
     })
 }
 
+/// The rolegroup metrics [`Service`] is a service that exposes metrics and has the
+/// prometheus.io/scrape label.
+fn build_rolegroup_metrics_service(
+    opa: &v1alpha1::OpaCluster,
+    resolved_product_image: &ResolvedProductImage,
+    rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
+) -> Result<Service> {
+    let labels = Labels::try_from([("prometheus.io/scrape", "true")])
+        .expect("static Prometheus labels must be valid");
+
+    let metadata = ObjectMetaBuilder::new()
+        .name_and_namespace(opa)
+        .name(rolegroup.rolegroup_metrics_service_name())
+        .ownerreference_from_resource(opa, None, Some(true))
+        .context(ObjectMissingMetadataForOwnerRefSnafu)?
+        .with_recommended_labels(build_recommended_labels(
+            opa,
+            &resolved_product_image.app_version_label,
+            &rolegroup.role,
+            &rolegroup.role_group,
+        ))
+        .context(ObjectMetaSnafu)?
+        .with_labels(labels)
+        .build();
+
+    let service_spec = ServiceSpec {
+        type_: Some("ClusterIP".to_string()),
+        cluster_ip: Some("None".to_string()),
+        ports: Some(vec![metrics_service_port()]),
+        selector: Some(role_group_selector_labels(opa, rolegroup)?.into()),
+        ..ServiceSpec::default()
+    };
+
+    Ok(Service {
+        metadata,
+        spec: Some(service_spec),
+        status: None,
+    })
+}
+
+/// Returns the [`Labels`] that can be used to select all Pods that are part of the roleGroup.
+fn role_group_selector_labels(
+    opa: &v1alpha1::OpaCluster,
+    rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
+) -> Result<Labels> {
+    Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group)
+        .context(BuildLabelSnafu)
+}
+
 /// The rolegroup [`ConfigMap`] configures the rolegroup based on the configuration given by the administrator
 fn build_server_rolegroup_config_map(
     opa: &v1alpha1::OpaCluster,
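
And the corresponding metrics Service, sketched under the same assumptions (the selector keys come from role_group_selector_labels above and are abbreviated here):

    apiVersion: v1
    kind: Service
    metadata:
      name: simple-opa-server-default-metrics
      labels:
        prometheus.io/scrape: "true"
    spec:
      type: ClusterIP
      clusterIP: None
      ports:
        - name: metrics     # assuming METRICS_PORT_NAME is "metrics"
          port: 8081        # same port as the HTTP traffic, see metrics_service_port() below
          protocol: TCP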
@@ -904,6 +975,11 @@ fn build_server_rolegroup_daemonset(
         format!("{STACKABLE_LOG_DIR}/containerdebug"),
     )
     .add_container_port(APP_PORT_NAME, APP_PORT.into())
+    // If we also add a container port "metrics" pointing to the same port number, we get a
+    //
+    // .spec.template.spec.containers[name="opa"].ports: duplicate entries for key [containerPort=8081,protocol="TCP"]
+    //
+    // So we don't do that
     .add_volume_mount(CONFIG_VOLUME_NAME, CONFIG_DIR)
     .context(AddVolumeMountSnafu)?
     .add_volume_mount(LOG_VOLUME_NAME, STACKABLE_LOG_DIR)
@@ -1387,22 +1463,24 @@ fn build_prepare_start_command(
     prepare_container_args
 }
 
-fn service_ports() -> Vec<ServicePort> {
-    vec![
-        ServicePort {
-            name: Some(APP_PORT_NAME.to_string()),
-            port: APP_PORT.into(),
-            protocol: Some("TCP".to_string()),
-            ..ServicePort::default()
-        },
-        ServicePort {
-            name: Some(METRICS_PORT_NAME.to_string()),
-            port: 9504, // Arbitrary port number, this is never actually used anywhere
-            protocol: Some("TCP".to_string()),
-            target_port: Some(IntOrString::String(APP_PORT_NAME.to_string())),
-            ..ServicePort::default()
-        },
-    ]
+fn data_service_ports() -> Vec<ServicePort> {
+    // Currently only HTTP is exposed
+    vec![ServicePort {
+        name: Some(APP_PORT_NAME.to_string()),
+        port: APP_PORT.into(),
+        protocol: Some("TCP".to_string()),
+        ..ServicePort::default()
+    }]
+}
+
+fn metrics_service_port() -> ServicePort {
+    ServicePort {
+        name: Some(METRICS_PORT_NAME.to_string()),
+        // The metrics are served on the same port as the HTTP traffic
+        port: APP_PORT.into(),
+        protocol: Some("TCP".to_string()),
+        ..ServicePort::default()
+    }
 }
 
 /// Creates recommended `ObjectLabels` to be used in deployed resources
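
Since the new Service only carries the prometheus.io/scrape label, a scraper still has to opt in. A minimal Prometheus sketch (conventional label-based service discovery; not part of this commit) that would pick up the -metrics Services:

    scrape_configs:
      - job_name: stackable-opa-metrics
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          # keep only endpoints whose Service is labeled prometheus.io/scrape=true
          - source_labels: [__meta_kubernetes_service_label_prometheus_io_scrape]
            action: keep
            regex: "true"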

rust/operator-binary/src/crd/mod.rs

Lines changed: 6 additions & 2 deletions
@@ -17,7 +17,7 @@ use stackable_operator::{
         merge::Merge,
     },
     k8s_openapi::apimachinery::pkg::api::resource::Quantity,
-    kube::CustomResource,
+    kube::{CustomResource, ResourceExt},
     product_config_utils::Configuration,
    product_logging::{self, spec::Logging},
     role_utils::{
@@ -326,7 +326,11 @@ impl v1alpha1::OpaCluster {
 
     /// The name of the role-level load-balanced Kubernetes `Service`
     pub fn server_role_service_name(&self) -> Option<String> {
-        self.metadata.name.clone()
+        Some(format!(
+            "{cluster_name}-{role}",
+            cluster_name = self.name_any(),
+            role = v1alpha1::OpaRole::Server
+        ))
     }
 
     /// The fully-qualified domain name of the role-level load-balanced Kubernetes `Service`
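
The effect of the new name format, assuming OpaRole::Server displays as "server":

    # before: server_role_service_name() == Some("test-opa")          for an OpaCluster named "test-opa"
    # after:  server_role_service_name() == Some("test-opa-server")

which is why the kuttl assertions below now target http://test-opa-server:8081.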

tests/templates/kuttl/aas-user-info/30-assert.yaml

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ kind: TestAssert
 metadata:
   name: test-regorule
 commands:
-  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test'
+  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test'

tests/templates/kuttl/ad-user-info/30-assert.yaml

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ kind: TestAssert
 metadata:
   name: test-regorule
 commands:
-  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test'
+  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test'

tests/templates/kuttl/keycloak-user-info/30-assert.yaml

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ kind: TestAssert
 metadata:
   name: test-regorule
 commands:
-  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test'
+  - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test'
