prevent failure of calculatings results for a single workload from failing entire scan (#392)

aantn · web-flow · commit 7274380cf0d1 · 2025-01-16T13:04:40.000+02:00
diff --git a/robusta_krr/core/runner.py b/robusta_krr/core/runner.py
@@ -176,39 +176,43 @@ def _format_result(self, result: RunResult) -> RunResult:
         }
 
     async def _calculate_object_recommendations(self, object: K8sObjectData) -> Optional[RunResult]:
-        prometheus_loader = self._get_prometheus_loader(object.cluster)
-
-        if prometheus_loader is None:
-            return None
-
-        object.pods = await prometheus_loader.load_pods(object, self._strategy.settings.history_timedelta)
-        if object.pods == []:
-            # Fallback to Kubernetes API
-            object.pods = await self._k8s_loader.load_pods(object)
-
-            # NOTE: Kubernetes API returned pods, but Prometheus did not
-            # This might happen with fast executing jobs
-            if object.pods != []:
-                object.add_warning("NoPrometheusPods")
-                logger.warning(
-                    f"Was not able to load any pods for {object} from Prometheus. "
-                    "Loaded pods from Kubernetes API instead."
-                )
-
-        metrics = await prometheus_loader.gather_data(
-            object,
-            self._strategy,
-            self._strategy.settings.history_timedelta,
-            step=self._strategy.settings.timeframe_timedelta,
-        )
+        try:
+            prometheus_loader = self._get_prometheus_loader(object.cluster)
+
+            if prometheus_loader is None:
+                return None
+
+            object.pods = await prometheus_loader.load_pods(object, self._strategy.settings.history_timedelta)
+            if object.pods == []:
+                # Fallback to Kubernetes API
+                object.pods = await self._k8s_loader.load_pods(object)
+
+                # NOTE: Kubernetes API returned pods, but Prometheus did not
+                # This might happen with fast executing jobs
+                if object.pods != []:
+                    object.add_warning("NoPrometheusPods")
+                    logger.warning(
+                        f"Was not able to load any pods for {object} from Prometheus. "
+                        "Loaded pods from Kubernetes API instead."
+                    )
+
+            metrics = await prometheus_loader.gather_data(
+                object,
+                self._strategy,
+                self._strategy.settings.history_timedelta,
+                step=self._strategy.settings.timeframe_timedelta,
+            )
 
-        # NOTE: We run this in a threadpool as the strategy calculation might be CPU intensive
-        # But keep in mind that numpy calcluations will not block the GIL
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(self._executor, self._strategy.run, metrics, object)
+            # NOTE: We run this in a threadpool as the strategy calculation might be CPU intensive
+            # But keep in mind that numpy calcluations will not block the GIL
+            loop = asyncio.get_running_loop()
+            result = await loop.run_in_executor(self._executor, self._strategy.run, metrics, object)
 
-        logger.info(f"Calculated recommendations for {object} (using {len(metrics)} metrics)")
-        return self._format_result(result)
+            logger.info(f"Calculated recommendations for {object} (using {len(metrics)} metrics)")
+            return self._format_result(result)
+        except Exception as e:
+            logger.error(f"An error occurred while calculating recommendations for {object}: {e}")
+            return None
 
     async def _check_data_availability(self, cluster: Optional[str]) -> None:
         prometheus_loader = self._get_prometheus_loader(cluster)
@@ -308,7 +312,7 @@ async def _collect_result(self) -> Result:
             raise CriticalRunnerException("No successful scans were made. Check the logs for more information.")
 
         return Result(
-            scans=scans,
+            scans=successful_scans,
             description=f"[b]{self._strategy.display_name.title()} Strategy[/b]\n\n{self._strategy.description}",
             strategy=StrategyData(
                 name=str(self._strategy).lower(),