5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file.
- Support experimental user-info-fetcher Entra backend to fetch user groups ([#712]).
- Add support for OPA `1.4.2` ([#723]).
- Add RBAC rule to helm template for automatic cluster domain detection ([#743]).
- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#748]).
- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#748]).
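For illustration only (not part of this change): a minimal sketch of a Prometheus scrape configuration that picks up Services carrying the `prometheus.io/scrape` label, which the new per-rolegroup `-metrics` Service sets. The job name is hypothetical; any label-based in-cluster service discovery works the same way.

```yaml
scrape_configs:
  - job_name: stackable-metrics  # hypothetical job name
    kubernetes_sd_configs:
      - role: endpoints  # discover the Endpoints backing all Services
    relabel_configs:
      # Keep only targets whose Service is labeled prometheus.io/scrape: "true"
      - source_labels: [__meta_kubernetes_service_label_prometheus_io_scrape]
        action: keep
        regex: "true"
```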

### Changed

@@ -49,6 +51,8 @@ All notable changes to this project will be documented in this file.
- The CLI argument `--kubernetes-node-name` or env variable `KUBERNETES_NODE_NAME` needs to be set. The helm-chart takes care of this.
- The operator helm-chart now grants RBAC `patch` permissions on `events.k8s.io/events`,
so events can be aggregated (e.g. "error happened 10 times over the last 5 minutes") ([#745]).
- The per-rolegroup services now only serve the HTTP port and have a `-headless` suffix to better indicate their
  purpose and to be consistent with other operators ([#748]).

### Fixed

@@ -77,6 +81,7 @@ All notable changes to this project will be documented in this file.
[#743]: https://github.com/stackabletech/opa-operator/pull/743
[#744]: https://github.com/stackabletech/opa-operator/pull/744
[#745]: https://github.com/stackabletech/opa-operator/pull/745
[#748]: https://github.com/stackabletech/opa-operator/pull/748

## [25.3.0] - 2025-03-21

148 changes: 113 additions & 35 deletions rust/operator-binary/src/controller.rs
@@ -50,7 +50,7 @@ use stackable_operator::{
core::{DeserializeGuard, error_boundary},
runtime::{controller::Action, reflector::ObjectRef},
},
kvp::{Label, LabelError, Labels, ObjectLabels},
kvp::{LabelError, Labels, ObjectLabels},
logging::controller::ReconcilerError,
memory::{BinaryMultiple, MemoryQuantity},
product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config},
@@ -91,6 +91,7 @@ pub const BUNDLES_ACTIVE_DIR: &str = "/bundles/active";
pub const BUNDLES_INCOMING_DIR: &str = "/bundles/incoming";
pub const BUNDLES_TMP_DIR: &str = "/bundles/tmp";
pub const BUNDLE_BUILDER_PORT: i32 = 3030;
pub const OPA_STACKABLE_SERVICE_NAME: &str = "stackable";

const CONFIG_VOLUME_NAME: &str = "config";
const CONFIG_DIR: &str = "/stackable/config";
@@ -185,6 +186,12 @@ pub enum Error {
rolegroup: RoleGroupRef<v1alpha1::OpaCluster>,
},

#[snafu(display("failed to apply metrics Service for [{rolegroup}]"))]
ApplyRoleGroupMetricsService {
source: stackable_operator::cluster_resources::Error,
rolegroup: RoleGroupRef<v1alpha1::OpaCluster>,
},

#[snafu(display("failed to build ConfigMap for [{rolegroup}]"))]
BuildRoleGroupConfig {
source: stackable_operator::builder::configmap::Error,
@@ -337,19 +344,20 @@ pub struct OpaClusterConfigFile {
bundles: OpaClusterBundle,
#[serde(skip_serializing_if = "Option::is_none")]
decision_logs: Option<OpaClusterConfigDecisionLog>,
status: Option<OpaClusterConfigStatus>,
}

impl OpaClusterConfigFile {
pub fn new(decision_logging: Option<OpaClusterConfigDecisionLog>) -> Self {
Self {
services: vec![OpaClusterConfigService {
name: String::from("stackable"),
url: String::from("http://localhost:3030/opa/v1"),
name: OPA_STACKABLE_SERVICE_NAME.to_owned(),
url: "http://localhost:3030/opa/v1".to_owned(),
}],
bundles: OpaClusterBundle {
stackable: OpaClusterBundleConfig {
service: String::from("stackable"),
resource: String::from("opa/bundle.tar.gz"),
service: OPA_STACKABLE_SERVICE_NAME.to_owned(),
resource: "opa/bundle.tar.gz".to_owned(),
persist: true,
polling: OpaClusterBundleConfigPolling {
min_delay_seconds: 10,
@@ -358,6 +366,12 @@
},
},
decision_logs: decision_logging,
// Enable more Prometheus metrics, such as bundle loads
// See https://www.openpolicyagent.org/docs/monitoring#status-metrics
status: Some(OpaClusterConfigStatus {
service: OPA_STACKABLE_SERVICE_NAME.to_owned(),
prometheus: true,
}),
}
}
}
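For orientation, the config rendered by `new()` comes out roughly as the sketch below, assuming the file is serialized as YAML; the optional `decision_logs` section is omitted, and the `max_delay_seconds` value is collapsed in this diff view, so it is left out rather than guessed.

```yaml
services:
  - name: stackable
    url: http://localhost:3030/opa/v1
bundles:
  stackable:
    service: stackable
    resource: opa/bundle.tar.gz
    persist: true
    polling:
      min_delay_seconds: 10
      # max_delay_seconds: (value collapsed in this diff view)
# New: report status to the "stackable" service and expose status metrics
# (e.g. bundle loads) via the Prometheus endpoint
status:
  service: stackable
  prometheus: true
```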
@@ -392,6 +406,12 @@ pub struct OpaClusterConfigDecisionLog {
console: bool,
}

#[derive(Serialize, Deserialize)]
struct OpaClusterConfigStatus {
service: String,
prometheus: bool,
}

pub async fn reconcile_opa(
opa: Arc<DeserializeGuard<v1alpha1::OpaCluster>>,
ctx: Arc<Ctx>,
@@ -489,7 +509,10 @@
&rolegroup,
&merged_config,
)?;
let rg_service = build_rolegroup_service(opa, &resolved_product_image, &rolegroup)?;
let rg_service =
build_rolegroup_headless_service(opa, &resolved_product_image, &rolegroup)?;
let rg_metrics_service =
build_rolegroup_metrics_service(opa, &resolved_product_image, &rolegroup)?;
let rg_daemonset = build_server_rolegroup_daemonset(
opa,
&resolved_product_image,
@@ -515,6 +538,12 @@
.with_context(|_| ApplyRoleGroupServiceSnafu {
rolegroup: rolegroup.clone(),
})?;
cluster_resources
.add(client, rg_metrics_service)
.await
.with_context(|_| ApplyRoleGroupMetricsServiceSnafu {
rolegroup: rolegroup.clone(),
})?;
ds_cond_builder.add(
cluster_resources
.add(client, rg_daemonset.clone())
@@ -632,17 +661,14 @@ pub fn build_server_role_service(
/// The rolegroup [`Service`] is a headless service that allows direct access to the instances of a certain rolegroup
///
/// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing.
fn build_rolegroup_service(
fn build_rolegroup_headless_service(
opa: &v1alpha1::OpaCluster,
resolved_product_image: &ResolvedProductImage,
rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
) -> Result<Service> {
let prometheus_label =
Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?;

let metadata = ObjectMetaBuilder::new()
.name_and_namespace(opa)
.name(rolegroup.object_name())
.name(rolegroup.rolegroup_headless_service_name())
.ownerreference_from_resource(opa, None, Some(true))
.context(ObjectMissingMetadataForOwnerRefSnafu)?
.with_recommended_labels(build_recommended_labels(
@@ -652,19 +678,20 @@
&rolegroup.role_group,
))
.context(ObjectMetaSnafu)?
.with_label(prometheus_label)
.build();

let service_selector_labels =
Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group)
.context(BuildLabelSnafu)?;

let service_spec = ServiceSpec {
// Internal communication does not need to be exposed
// Currently we don't offer listener exposition of OPA, mostly due to security concerns:
// OPA is currently public within the Kubernetes cluster (without authentication), and
// opening it up to the outside of Kubernetes might make things worse.
// We are open to implementing listener integration, but this needs to be thought through
// before implementing it.
// Note: We have somewhat similar situations for HMS and ZooKeeper, as the authentication
// options there are non-existent (mTLS still opens a plain port) or poor (Kerberos).
type_: Some("ClusterIP".to_string()),
cluster_ip: Some("None".to_string()),
ports: Some(service_ports()),
selector: Some(service_selector_labels.into()),
ports: Some(data_service_ports()),
selector: Some(role_group_selector_labels(opa, rolegroup)?.into()),
publish_not_ready_addresses: Some(true),
..ServiceSpec::default()
};
@@ -676,6 +703,55 @@
})
}
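For the smoke-test cluster used below (`test-opa`, role `server`, role group `default`), this function renders a Service along the lines of the following sketch; the port number comes from the smoke test, while the port name and the exact selector label keys are assumptions.

```yaml
apiVersion: v1
kind: Service
metadata:
  name: test-opa-server-default-headless
spec:
  type: ClusterIP
  clusterIP: None  # headless: DNS resolves directly to the Pod IPs
  publishNotReadyAddresses: true
  # selector: the role-group selector labels from role_group_selector_labels()
  ports:
    - name: http  # APP_PORT_NAME (assumed)
      port: 8081
      protocol: TCP
```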

/// The rolegroup metrics [`Service`] exposes the Prometheus metrics endpoint and carries the
/// `prometheus.io/scrape` label.
fn build_rolegroup_metrics_service(
opa: &v1alpha1::OpaCluster,
resolved_product_image: &ResolvedProductImage,
rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
) -> Result<Service> {
let labels = Labels::try_from([("prometheus.io/scrape", "true")])
.expect("static Prometheus labels must be valid");

let metadata = ObjectMetaBuilder::new()
.name_and_namespace(opa)
.name(rolegroup.rolegroup_metrics_service_name())
.ownerreference_from_resource(opa, None, Some(true))
.context(ObjectMissingMetadataForOwnerRefSnafu)?
.with_recommended_labels(build_recommended_labels(
opa,
&resolved_product_image.app_version_label,
&rolegroup.role,
&rolegroup.role_group,
))
.context(ObjectMetaSnafu)?
.with_labels(labels)
.build();

let service_spec = ServiceSpec {
type_: Some("ClusterIP".to_string()),
cluster_ip: Some("None".to_string()),
ports: Some(vec![metrics_service_port()]),
selector: Some(role_group_selector_labels(opa, rolegroup)?.into()),
..ServiceSpec::default()
};

Ok(Service {
metadata,
spec: Some(service_spec),
status: None,
})
}
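Again as a sketch for the smoke-test cluster: the service name matches the URL used in `30_test-metrics.py` below, and the port name is an assumption.

```yaml
apiVersion: v1
kind: Service
metadata:
  name: test-opa-server-default-metrics
  labels:
    prometheus.io/scrape: "true"  # enables label-based Prometheus discovery
spec:
  type: ClusterIP
  clusterIP: None
  # selector: the same role-group selector labels as the headless Service
  ports:
    - name: metrics  # METRICS_PORT_NAME (assumed)
      port: 8081     # metrics are served on the same port as the HTTP traffic
      protocol: TCP
```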

/// Returns the [`Labels`] that can be used to select all Pods that are part of the role group.
fn role_group_selector_labels(
opa: &v1alpha1::OpaCluster,
rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
) -> Result<Labels> {
Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group)
.context(BuildLabelSnafu)
}

/// The rolegroup [`ConfigMap`] configures the rolegroup based on the configuration given by the administrator
fn build_server_rolegroup_config_map(
opa: &v1alpha1::OpaCluster,
@@ -1387,22 +1463,24 @@ fn build_prepare_start_command(
prepare_container_args
}

fn service_ports() -> Vec<ServicePort> {
vec![
ServicePort {
name: Some(APP_PORT_NAME.to_string()),
port: APP_PORT.into(),
protocol: Some("TCP".to_string()),
..ServicePort::default()
},
ServicePort {
name: Some(METRICS_PORT_NAME.to_string()),
port: 9504, // Arbitrary port number, this is never actually used anywhere
protocol: Some("TCP".to_string()),
target_port: Some(IntOrString::String(APP_PORT_NAME.to_string())),
..ServicePort::default()
},
]
fn data_service_ports() -> Vec<ServicePort> {
// Currently only HTTP is exposed
vec![ServicePort {
name: Some(APP_PORT_NAME.to_string()),
port: APP_PORT.into(),
protocol: Some("TCP".to_string()),
..ServicePort::default()
}]
}

fn metrics_service_port() -> ServicePort {
ServicePort {
name: Some(METRICS_PORT_NAME.to_string()),
// The metrics are served on the same port as the HTTP traffic
port: APP_PORT.into(),
protocol: Some("TCP".to_string()),
..ServicePort::default()
}
}

/// Creates recommended `ObjectLabels` to be used in deployed resources
2 changes: 1 addition & 1 deletion tests/templates/kuttl/smoke/20-assert.yaml
@@ -6,7 +6,7 @@ timeout: 300
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: test-regorule
name: test-opa
status:
readyReplicas: 1
replicas: 1
@@ -2,21 +2,21 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: test-regorule
name: test-opa
labels:
app: test-regorule
app: test-opa
spec:
replicas: 1
selector:
matchLabels:
app: test-regorule
app: test-opa
template:
metadata:
labels:
app: test-regorule
app: test-opa
spec:
containers:
- name: test-regorule
- name: test-opa
image: oci.stackable.tech/sdp/testing-tools:0.2.0-stackable0.0.0-dev
stdin: true
tty: true
2 changes: 1 addition & 1 deletion tests/templates/kuttl/smoke/30-assert.yaml
@@ -4,4 +4,4 @@ kind: TestAssert
metadata:
name: test-regorule
commands:
- script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test'
- script: kubectl exec -n $NAMESPACE test-opa-0 -- python /tmp/30_test-regorule.py -u 'http://test-opa:8081/v1/data/test'
5 changes: 5 additions & 0 deletions tests/templates/kuttl/smoke/30-prepare-test-opa.yaml
@@ -0,0 +1,5 @@
apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
- script: kubectl cp -n $NAMESPACE ./30_test-regorule.py test-opa-0:/tmp
- script: kubectl cp -n $NAMESPACE ./30_test-metrics.py test-opa-0:/tmp
4 changes: 0 additions & 4 deletions tests/templates/kuttl/smoke/30-prepare-test-regorule.yaml

This file was deleted.

12 changes: 12 additions & 0 deletions tests/templates/kuttl/smoke/30_test-metrics.py
@@ -0,0 +1,12 @@
#!/usr/bin/env python
import requests

# The dedicated per-rolegroup metrics Service exposes the metrics on the HTTP port
metrics_url = "http://test-opa-server-default-metrics:8081/metrics"
response = requests.get(metrics_url)

assert response.status_code == 200, "Metrics endpoint must return a 200 status code"
assert "bundle_loaded_counter" in response.text, f"Metric bundle_loaded_counter should exist in {metrics_url}"

print("Metrics test successful!")
tests/templates/kuttl/smoke/30_test-regorule.py
@@ -34,7 +34,7 @@
and "hello" in response["result"]
and response["result"]["hello"]
):
print("Test successful!")
print("Regorule test successful!")
exit(0)
else:
print(
@@ -43,3 +43,5 @@
+ " - expected: {'result': {'hello': True}}"
)
exit(-1)

7 changes: 7 additions & 0 deletions tests/templates/kuttl/smoke/31-assert.yaml
@@ -0,0 +1,7 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
metadata:
name: test-metrics
commands:
- script: kubectl exec -n $NAMESPACE test-opa-0 -- python /tmp/30_test-metrics.py