2121from ..prometheus_utils import ClusterNotSpecifiedException , generate_prometheus_config
2222from .base_metric_service import MetricsService
2323
24+ PROM_REFRESH_CREDS_SEC = int (os .environ .get ("PROM_REFRESH_CREDS_SEC" , "600" )) # 10 minutes
25+
2426logger = logging .getLogger ("krr" )
2527
2628
@@ -105,30 +107,39 @@ def __init__(
105107 elif not settings .inside_cluster and self .api_client is not None :
106108 self .api_client .update_params_for_auth (headers , {}, ["BearerToken" ])
107109 self .prom_config = generate_prometheus_config (url = self .url , headers = headers , metrics_service = self )
108- self .prometheus = get_custom_prometheus_connect (self .prom_config )
110+ self .get_prometheus ()
111+
112+ def get_prometheus (self ):
113+ now = datetime .utcnow ()
114+ if (not self .prometheus
115+ or not self ._last_init_at
116+ or now - self ._last_init_at >= timedelta (seconds = PROM_REFRESH_CREDS_SEC )):
117+ self .prometheus = get_custom_prometheus_connect (self .prom_config )
118+ self ._last_init_at = now
119+ return self .prometheus
109120
110121 def check_connection (self ):
111122 """
112123 Checks the connection to Prometheus.
113124 Raises:
114125 PrometheusNotFound: If the connection to Prometheus cannot be established.
115126 """
116- self .prometheus .check_prometheus_connection ()
127+ self .get_prometheus () .check_prometheus_connection ()
117128
118129 @retry (wait = wait_random (min = 2 , max = 10 ), stop = stop_after_attempt (5 ))
119130 async def query (self , query : str ) -> dict :
120131 loop = asyncio .get_running_loop ()
121132 return await loop .run_in_executor (
122133 self .executor ,
123- lambda : self .prometheus .safe_custom_query (query = query )["result" ],
134+ lambda : self .get_prometheus () .safe_custom_query (query = query )["result" ],
124135 )
125136
126137 @retry (wait = wait_random (min = 2 , max = 10 ), stop = stop_after_attempt (5 ))
127138 async def query_range (self , query : str , start : datetime , end : datetime , step : timedelta ) -> dict :
128139 loop = asyncio .get_running_loop ()
129140 return await loop .run_in_executor (
130141 self .executor ,
131- lambda : self .prometheus .safe_custom_query_range (
142+ lambda : self .get_prometheus () .safe_custom_query_range (
132143 query = query , start_time = start , end_time = end , step = f"{ step .seconds } s"
133144 )["result" ],
134145 )
@@ -155,7 +166,7 @@ def validate_cluster_name(self):
155166
156167 def get_cluster_names (self ) -> Optional [List [str ]]:
157168 try :
158- return self .prometheus .get_label_values (label_name = settings .prometheus_label )
169+ return self .get_prometheus () .get_label_values (label_name = settings .prometheus_label )
159170 except PrometheusApiClientException :
160171 logger .error ("Labels api not present on prometheus client" )
161172 return []
@@ -194,7 +205,7 @@ async def gather_data(
194205 """
195206 logger .debug (f"Gathering { LoaderClass .__name__ } metric for { object } " )
196207 try :
197- metric_loader = LoaderClass (self .prometheus , self .name (), self .executor )
208+ metric_loader = LoaderClass (self .get_prometheus () , self .name (), self .executor )
198209 data = await metric_loader .load_data (object , period , step )
199210 except Exception :
200211 logger .exception ("Failed to gather resource history data for %s" , object )
0 commit comments