@@ -50,7 +50,7 @@ use stackable_operator::{
         core::{DeserializeGuard, error_boundary},
         runtime::{controller::Action, reflector::ObjectRef},
     },
-    kvp::{Label, LabelError, Labels, ObjectLabels},
+    kvp::{LabelError, Labels, ObjectLabels},
     logging::controller::ReconcilerError,
     memory::{BinaryMultiple, MemoryQuantity},
     product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config},
@@ -91,6 +91,7 @@ pub const BUNDLES_ACTIVE_DIR: &str = "/bundles/active";
 pub const BUNDLES_INCOMING_DIR: &str = "/bundles/incoming";
 pub const BUNDLES_TMP_DIR: &str = "/bundles/tmp";
 pub const BUNDLE_BUILDER_PORT: i32 = 3030;
+pub const OPA_STACKABLE_SERVICE_NAME: &str = "stackable";
 
 const CONFIG_VOLUME_NAME: &str = "config";
 const CONFIG_DIR: &str = "/stackable/config";
@@ -185,6 +186,12 @@ pub enum Error {
         rolegroup: RoleGroupRef<v1alpha1::OpaCluster>,
     },
 
+    #[snafu(display("failed to apply metrics Service for [{rolegroup}]"))]
+    ApplyRoleGroupMetricsService {
+        source: stackable_operator::cluster_resources::Error,
+        rolegroup: RoleGroupRef<v1alpha1::OpaCluster>,
+    },
+
     #[snafu(display("failed to build ConfigMap for [{rolegroup}]"))]
     BuildRoleGroupConfig {
         source: stackable_operator::builder::configmap::Error,
@@ -337,19 +344,20 @@ pub struct OpaClusterConfigFile {
     bundles: OpaClusterBundle,
     #[serde(skip_serializing_if = "Option::is_none")]
     decision_logs: Option<OpaClusterConfigDecisionLog>,
+    status: Option<OpaClusterConfigStatus>,
 }
 
 impl OpaClusterConfigFile {
     pub fn new(decision_logging: Option<OpaClusterConfigDecisionLog>) -> Self {
         Self {
             services: vec![OpaClusterConfigService {
-                name: String::from("stackable"),
-                url: String::from("http://localhost:3030/opa/v1"),
+                name: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                url: "http://localhost:3030/opa/v1".to_owned(),
             }],
             bundles: OpaClusterBundle {
                 stackable: OpaClusterBundleConfig {
-                    service: String::from("stackable"),
-                    resource: String::from("opa/bundle.tar.gz"),
+                    service: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                    resource: "opa/bundle.tar.gz".to_owned(),
                     persist: true,
                     polling: OpaClusterBundleConfigPolling {
                         min_delay_seconds: 10,
@@ -358,6 +366,12 @@ impl OpaClusterConfigFile {
                 },
             },
             decision_logs: decision_logging,
+            // Enable more Prometheus metrics, such as bundle loads.
+            // See https://www.openpolicyagent.org/docs/monitoring#status-metrics
+            status: Some(OpaClusterConfigStatus {
+                service: OPA_STACKABLE_SERVICE_NAME.to_owned(),
+                prometheus: true,
+            }),
         }
     }
 }
@@ -392,6 +406,12 @@ pub struct OpaClusterConfigDecisionLog {
     console: bool,
 }
 
+#[derive(Serialize, Deserialize)]
+struct OpaClusterConfigStatus {
+    service: String,
+    prometheus: bool,
+}
+
 pub async fn reconcile_opa(
     opa: Arc<DeserializeGuard<v1alpha1::OpaCluster>>,
     ctx: Arc<Ctx>,
@@ -489,7 +509,10 @@ pub async fn reconcile_opa(
             &rolegroup,
             &merged_config,
         )?;
-        let rg_service = build_rolegroup_service(opa, &resolved_product_image, &rolegroup)?;
+        let rg_service =
+            build_rolegroup_headless_service(opa, &resolved_product_image, &rolegroup)?;
+        let rg_metrics_service =
+            build_rolegroup_metrics_service(opa, &resolved_product_image, &rolegroup)?;
         let rg_daemonset = build_server_rolegroup_daemonset(
             opa,
             &resolved_product_image,
@@ -515,6 +538,12 @@ pub async fn reconcile_opa(
             .with_context(|_| ApplyRoleGroupServiceSnafu {
                 rolegroup: rolegroup.clone(),
             })?;
+        cluster_resources
+            .add(client, rg_metrics_service)
+            .await
+            .with_context(|_| ApplyRoleGroupMetricsServiceSnafu {
+                rolegroup: rolegroup.clone(),
+            })?;
         ds_cond_builder.add(
             cluster_resources
                 .add(client, rg_daemonset.clone())
@@ -611,12 +640,7 @@ pub fn build_server_role_service(
 
     let service_spec = ServiceSpec {
         type_: Some(opa.spec.cluster_config.listener_class.k8s_service_type()),
-        ports: Some(vec![ServicePort {
-            name: Some(APP_PORT_NAME.to_string()),
-            port: APP_PORT.into(),
-            protocol: Some("TCP".to_string()),
-            ..ServicePort::default()
-        }]),
+        ports: Some(data_service_ports()),
         selector: Some(service_selector_labels.into()),
         internal_traffic_policy: Some("Local".to_string()),
         ..ServiceSpec::default()
@@ -632,17 +656,14 @@ pub fn build_server_role_service(
 /// The rolegroup [`Service`] is a headless service that allows direct access to the instances of a certain rolegroup
 ///
 /// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing.
-fn build_rolegroup_service(
+fn build_rolegroup_headless_service(
     opa: &v1alpha1::OpaCluster,
     resolved_product_image: &ResolvedProductImage,
     rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
 ) -> Result<Service> {
-    let prometheus_label =
-        Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?;
-
     let metadata = ObjectMetaBuilder::new()
         .name_and_namespace(opa)
-        .name(rolegroup.object_name())
+        .name(rolegroup.rolegroup_headless_service_name())
         .ownerreference_from_resource(opa, None, Some(true))
         .context(ObjectMissingMetadataForOwnerRefSnafu)?
         .with_recommended_labels(build_recommended_labels(
@@ -652,19 +673,20 @@ fn build_rolegroup_service(
             &rolegroup.role_group,
         ))
         .context(ObjectMetaSnafu)?
-        .with_label(prometheus_label)
         .build();
 
-    let service_selector_labels =
-        Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group)
-            .context(BuildLabelSnafu)?;
-
     let service_spec = ServiceSpec {
-        // Internal communication does not need to be exposed
+        // Currently we don't offer listener-exposition of OPA, mostly due to security concerns:
+        // OPA is currently reachable from anywhere within Kubernetes (without authentication),
+        // and opening it up to the outside of Kubernetes might make things worse.
+        // We are open to implementing listener integration, but this needs to be thought through
+        // before implementing it.
+        // Note: We have somewhat similar situations for HMS and ZooKeeper, as the authentication
+        // options there are either non-existent (mTLS still opens the plain port) or painful (Kerberos).
         type_: Some("ClusterIP".to_string()),
         cluster_ip: Some("None".to_string()),
-        ports: Some(service_ports()),
-        selector: Some(service_selector_labels.into()),
+        ports: Some(data_service_ports()),
+        selector: Some(role_group_selector_labels(opa, rolegroup)?.into()),
         publish_not_ready_addresses: Some(true),
         ..ServiceSpec::default()
     };
@@ -676,6 +698,55 @@ fn build_rolegroup_service(
     })
 }
 
+/// The rolegroup metrics [`Service`] is a service that exposes metrics and has the
+/// prometheus.io/scrape label.
+fn build_rolegroup_metrics_service(
+    opa: &v1alpha1::OpaCluster,
+    resolved_product_image: &ResolvedProductImage,
+    rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
+) -> Result<Service> {
+    let labels = Labels::try_from([("prometheus.io/scrape", "true")])
+        .expect("static Prometheus labels must be valid");
+
+    let metadata = ObjectMetaBuilder::new()
+        .name_and_namespace(opa)
+        .name(rolegroup.rolegroup_metrics_service_name())
+        .ownerreference_from_resource(opa, None, Some(true))
+        .context(ObjectMissingMetadataForOwnerRefSnafu)?
+        .with_recommended_labels(build_recommended_labels(
+            opa,
+            &resolved_product_image.app_version_label,
+            &rolegroup.role,
+            &rolegroup.role_group,
+        ))
+        .context(ObjectMetaSnafu)?
+        .with_labels(labels)
+        .build();
+
+    let service_spec = ServiceSpec {
+        type_: Some("ClusterIP".to_string()),
+        cluster_ip: Some("None".to_string()),
+        ports: Some(vec![metrics_service_port()]),
+        selector: Some(role_group_selector_labels(opa, rolegroup)?.into()),
+        ..ServiceSpec::default()
+    };
+
+    Ok(Service {
+        metadata,
+        spec: Some(service_spec),
+        status: None,
+    })
+}
+
+/// Returns the [`Labels`] that can be used to select all Pods that are part of the rolegroup.
+fn role_group_selector_labels(
+    opa: &v1alpha1::OpaCluster,
+    rolegroup: &RoleGroupRef<v1alpha1::OpaCluster>,
+) -> Result<Labels> {
+    Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group)
+        .context(BuildLabelSnafu)
+}
+
 /// The rolegroup [`ConfigMap`] configures the rolegroup based on the configuration given by the administrator
 fn build_server_rolegroup_config_map(
     opa: &v1alpha1::OpaCluster,
@@ -904,6 +975,11 @@ fn build_server_rolegroup_daemonset(
             format!("{STACKABLE_LOG_DIR}/containerdebug"),
         )
         .add_container_port(APP_PORT_NAME, APP_PORT.into())
+        // If we also add a container port "metrics" pointing to the same port number, we get a
+        //
+        // .spec.template.spec.containers[name="opa"].ports: duplicate entries for key [containerPort=8081,protocol="TCP"]
+        //
+        // So we don't do that.
         .add_volume_mount(CONFIG_VOLUME_NAME, CONFIG_DIR)
         .context(AddVolumeMountSnafu)?
         .add_volume_mount(LOG_VOLUME_NAME, STACKABLE_LOG_DIR)
@@ -1387,22 +1463,24 @@ fn build_prepare_start_command(
     prepare_container_args
 }
 
-fn service_ports() -> Vec<ServicePort> {
-    vec![
-        ServicePort {
-            name: Some(APP_PORT_NAME.to_string()),
-            port: APP_PORT.into(),
-            protocol: Some("TCP".to_string()),
-            ..ServicePort::default()
-        },
-        ServicePort {
-            name: Some(METRICS_PORT_NAME.to_string()),
-            port: 9504, // Arbitrary port number, this is never actually used anywhere
-            protocol: Some("TCP".to_string()),
-            target_port: Some(IntOrString::String(APP_PORT_NAME.to_string())),
-            ..ServicePort::default()
-        },
-    ]
+fn data_service_ports() -> Vec<ServicePort> {
+    // Currently only HTTP is exposed
+    vec![ServicePort {
+        name: Some(APP_PORT_NAME.to_string()),
+        port: APP_PORT.into(),
+        protocol: Some("TCP".to_string()),
+        ..ServicePort::default()
+    }]
+}
+
+fn metrics_service_port() -> ServicePort {
+    ServicePort {
+        name: Some(METRICS_PORT_NAME.to_string()),
+        // The metrics are served on the same port as the HTTP traffic
+        port: APP_PORT.into(),
+        protocol: Some("TCP".to_string()),
+        ..ServicePort::default()
+    }
 }
 
 /// Creates recommended `ObjectLabels` to be used in deployed resources
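
For reference only (not part of the diff): with serde's default field naming, the new `status` field of `OpaClusterConfigFile` should render into the generated OPA config roughly as sketched below, alongside the existing `services` and `bundles` sections. The exact output depends on how the operator serializes the struct, so treat this as an illustration rather than verbatim operator output; see the linked OPA monitoring docs for the meaning of the status metrics.

```yaml
# Hedged sketch of the generated OPA config fragment, not verbatim output
status:
  service: stackable   # OPA_STACKABLE_SERVICE_NAME, the service defined under `services`
  prometheus: true     # enables the additional status metrics (e.g. bundle loads) on the Prometheus endpoint
```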