Skip to content

Commit 7274380

Browse files
authored
Prevent a failure in calculating results for a single workload from failing the entire scan (#392)
1 parent 89d1b9f commit 7274380

File tree

1 file changed

+36
-32
lines changed

1 file changed

+36
-32
lines changed

robusta_krr/core/runner.py

Lines changed: 36 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -176,39 +176,43 @@ def _format_result(self, result: RunResult) -> RunResult:
176176
}
177177

178178
async def _calculate_object_recommendations(self, object: K8sObjectData) -> Optional[RunResult]:
179-
prometheus_loader = self._get_prometheus_loader(object.cluster)
180-
181-
if prometheus_loader is None:
182-
return None
183-
184-
object.pods = await prometheus_loader.load_pods(object, self._strategy.settings.history_timedelta)
185-
if object.pods == []:
186-
# Fallback to Kubernetes API
187-
object.pods = await self._k8s_loader.load_pods(object)
188-
189-
# NOTE: Kubernetes API returned pods, but Prometheus did not
190-
# This might happen with fast executing jobs
191-
if object.pods != []:
192-
object.add_warning("NoPrometheusPods")
193-
logger.warning(
194-
f"Was not able to load any pods for {object} from Prometheus. "
195-
"Loaded pods from Kubernetes API instead."
196-
)
197-
198-
metrics = await prometheus_loader.gather_data(
199-
object,
200-
self._strategy,
201-
self._strategy.settings.history_timedelta,
202-
step=self._strategy.settings.timeframe_timedelta,
203-
)
179+
try:
180+
prometheus_loader = self._get_prometheus_loader(object.cluster)
181+
182+
if prometheus_loader is None:
183+
return None
184+
185+
object.pods = await prometheus_loader.load_pods(object, self._strategy.settings.history_timedelta)
186+
if object.pods == []:
187+
# Fallback to Kubernetes API
188+
object.pods = await self._k8s_loader.load_pods(object)
189+
190+
# NOTE: Kubernetes API returned pods, but Prometheus did not
191+
# This might happen with fast executing jobs
192+
if object.pods != []:
193+
object.add_warning("NoPrometheusPods")
194+
logger.warning(
195+
f"Was not able to load any pods for {object} from Prometheus. "
196+
"Loaded pods from Kubernetes API instead."
197+
)
198+
199+
metrics = await prometheus_loader.gather_data(
200+
object,
201+
self._strategy,
202+
self._strategy.settings.history_timedelta,
203+
step=self._strategy.settings.timeframe_timedelta,
204+
)
204205

205-
# NOTE: We run this in a threadpool as the strategy calculation might be CPU intensive
206-
# But keep in mind that numpy calculations will not block the GIL
207-
loop = asyncio.get_running_loop()
208-
result = await loop.run_in_executor(self._executor, self._strategy.run, metrics, object)
206+
# NOTE: We run this in a threadpool as the strategy calculation might be CPU intensive
207+
# But keep in mind that numpy calculations will not block the GIL
208+
loop = asyncio.get_running_loop()
209+
result = await loop.run_in_executor(self._executor, self._strategy.run, metrics, object)
209210

210-
logger.info(f"Calculated recommendations for {object} (using {len(metrics)} metrics)")
211-
return self._format_result(result)
211+
logger.info(f"Calculated recommendations for {object} (using {len(metrics)} metrics)")
212+
return self._format_result(result)
213+
except Exception as e:
214+
logger.error(f"An error occurred while calculating recommendations for {object}: {e}")
215+
return None
212216

213217
async def _check_data_availability(self, cluster: Optional[str]) -> None:
214218
prometheus_loader = self._get_prometheus_loader(cluster)
@@ -308,7 +312,7 @@ async def _collect_result(self) -> Result:
308312
raise CriticalRunnerException("No successful scans were made. Check the logs for more information.")
309313

310314
return Result(
311-
scans=scans,
315+
scans=successful_scans,
312316
description=f"[b]{self._strategy.display_name.title()} Strategy[/b]\n\n{self._strategy.description}",
313317
strategy=StrategyData(
314318
name=str(self._strategy).lower(),

0 commit comments

Comments
 (0)