grouped_job_working

Avi-Robusta · Avi-Robusta · commit 0ee1c88977e7 · 2025-10-05T14:26:53.000+03:00
diff --git a/robusta_krr/core/integrations/kubernetes/__init__.py b/robusta_krr/core/integrations/kubernetes/__init__.py
@@ -107,6 +107,7 @@ async def list_scannable_objects(self) -> list[K8sObjectData]:
             self._list_all_daemon_set(),
             self._list_all_jobs(),
             self._list_all_cronjobs(),
+            self._list_all_groupedjobs(),
         )
 
         return [
@@ -146,6 +147,22 @@ async def list_pods(self, object: K8sObjectData) -> list[PodData]:
             ]
             selector = f"batch.kubernetes.io/controller-uid in ({','.join(ownered_jobs_uids)})"
 
+        elif object.kind == "GroupedJob":
+            # For GroupedJob, we need to get pods using the label+value filter
+            if not hasattr(object._api_resource, '_label_filters') or not object._api_resource._label_filters:
+                return []
+            
+            # Use the label+value filter to get pods
+            label_selector = ",".join(object._api_resource._label_filters)
+            ret: V1PodList = await loop.run_in_executor(
+                self.executor,
+                lambda: self.core.list_namespaced_pod(
+                    namespace=object.namespace, label_selector=label_selector
+                ),
+            )
+            
+            return [PodData(name=pod.metadata.name, deleted=False) for pod in ret.items]
+
         else:
             if object.selector is None:
                 return []
@@ -442,15 +459,24 @@ def _list_all_daemon_set(self) -> list[K8sObjectData]:
         )
 
     def _list_all_jobs(self) -> list[K8sObjectData]:
+        def filter_jobs(item):
+            """Return True for Jobs to scan as kind "Job": not CronJob-owned and without grouping labels."""
+            owners = item.metadata.owner_references or []
+            # Jobs owned by a CronJob are skipped here
+            if any(owner.kind == "CronJob" for owner in owners):
+                return False
+
+            # Jobs carrying any grouping label are skipped too (they will be handled by GroupedJob)
+            grouping_labels = settings.job_grouping_labels
+            job_labels = item.metadata.labels
+            return not (grouping_labels and job_labels and any(name in job_labels for name in grouping_labels))
+
         return self._list_scannable_objects(
             kind="Job",
             all_namespaces_request=self.batch.list_job_for_all_namespaces,
             namespaced_request=self.batch.list_namespaced_job,
             extract_containers=lambda item: item.spec.template.spec.containers,
-            # NOTE: If the job has ownerReference and it is a CronJob, then we should skip it
-            filter_workflows=lambda item: not any(
-                owner.kind == "CronJob" for owner in item.metadata.owner_references or []
-            ),
+            filter_workflows=filter_jobs,
         )
 
     def _list_all_cronjobs(self) -> list[K8sObjectData]:
@@ -461,6 +487,77 @@ def _list_all_cronjobs(self) -> list[K8sObjectData]:
             extract_containers=lambda item: item.spec.job_template.spec.template.spec.containers,
         )
 
+    async def _list_all_groupedjobs(self) -> list[K8sObjectData]:
+        """List virtual GroupedJob objects, one per (namespace, grouping label, label value).
+
+        A Job that is not owned by a CronJob joins one group for every configured grouping
+        label it carries, so a Job with several grouping labels appears in several groups.
+        Each group is reported through the first container of its first Job.
+
+        Returns an empty list when no grouping labels are configured or when
+        `_should_list_resource("GroupedJob")` is false.
+        """
+        if not settings.job_grouping_labels:
+            logger.debug("No job grouping labels configured, skipping GroupedJob listing")
+            return []
+
+        if not self._should_list_resource("GroupedJob"):
+            logger.debug("Skipping GroupedJob in cluster")
+            return []
+
+        logger.debug(f"Listing GroupedJobs with grouping labels: {settings.job_grouping_labels}")
+
+        # Fetch every Job; the grouping labels are matched client-side below
+        all_jobs = await self._list_namespaced_or_global_objects(
+            kind="Job",
+            all_namespaces_request=self.batch.list_job_for_all_namespaces,
+            namespaced_request=self.batch.list_namespaced_job,
+        )
+
+        # Key groups by (namespace, "label=value") so the selector never has to be parsed
+        # back out of the display name. Labels are OR-ed: each matching label adds the Job
+        # to that label's group.
+        grouped_jobs = defaultdict(list)
+        for job in all_jobs:
+            labels = job.metadata.labels
+            if not labels:
+                continue
+            # CronJob-owned Jobs are excluded from grouping
+            if any(owner.kind == "CronJob" for owner in job.metadata.owner_references or []):
+                continue
+
+            for label_name in settings.job_grouping_labels:
+                if label_name in labels:
+                    label_filter = f"{label_name}={labels[label_name]}"
+                    grouped_jobs[(job.metadata.namespace, label_filter)].append(job)
+
+        result = []
+        for (namespace, label_filter), jobs in grouped_jobs.items():
+            # The first Job of the group, and its first container, stand in for the
+            # whole group when building the scannable object
+            template_job = jobs[0]
+            template_container = template_job.spec.template.spec.containers[0]
+
+            grouped_job = self.__build_scannable_object(
+                item=template_job,
+                container=template_container,
+                kind="GroupedJob"
+            )
+
+            # The group is named "namespace/label_name=label_value"
+            grouped_job.name = f"{namespace}/{label_filter}"
+            grouped_job.namespace = template_job.metadata.namespace
+
+            # Member Jobs of the group, kept on the API resource
+            grouped_job._api_resource._grouped_jobs = jobs
+            # Label selector ("label=value") that list_pods uses to find the group's pods
+            grouped_job._api_resource._label_filters = [label_filter]
+
+            result.append(grouped_job)
+
+        logger.debug(f"Found {len(result)} GroupedJob groups")
+        return result
+
     async def __list_hpa_v1(self) -> dict[HPAKey, HPAData]:
         loop = asyncio.get_running_loop()
         res = await loop.run_in_executor(
diff --git a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py
@@ -107,6 +107,7 @@ def __init__(
         elif not settings.inside_cluster and self.api_client is not None:
             self.api_client.update_params_for_auth(headers, {}, ["BearerToken"])
         self.prom_config = generate_prometheus_config(url=self.url, headers=headers, metrics_service=self)
+        self.prometheus = None
         self.get_prometheus()
 
     def get_prometheus(self):
diff --git a/robusta_krr/core/models/config.py b/robusta_krr/core/models/config.py
@@ -52,6 +52,9 @@ class Config(pd.BaseSettings):
 
     # Threading settings
     max_workers: int = pd.Field(6, ge=1)
+    
+    # Job grouping settings
+    job_grouping_labels: Union[list[str], str, None] = pd.Field(None, description="Label name(s) to use for grouping jobs into GroupedJob workload type")
 
     # Logging Settings
     format: str
@@ -130,6 +133,15 @@ def validate_resources(cls, v: Union[list[str], Literal["*"]]) -> Union[list[str
         # So this will preserve the big and small letters of the resource
         return [next(r for r in KindLiteral.__args__ if r.lower() == val.lower()) for val in v]
 
+    @pd.validator("job_grouping_labels", pre=True)
+    def validate_job_grouping_labels(cls, v: Union[list[str], str, None]) -> Union[list[str], None]:
+        """Normalize to a list of label names; a comma-separated string is split and blank entries dropped."""
+        if isinstance(v, str):
+            # Blank entries (from "", "app,,team" or a trailing comma) would become the label name "",
+            # which is not a usable grouping label, so drop them; if nothing is left return None.
+            return [name for name in (part.strip() for part in v.split(",")) if name] or None
+        return v
+
     def create_strategy(self) -> AnyStrategy:
         StrategyType = AnyStrategy.find(self.strategy)
         StrategySettingsType = StrategyType.get_settings_type()
diff --git a/robusta_krr/core/models/objects.py b/robusta_krr/core/models/objects.py
@@ -8,7 +8,7 @@
 from robusta_krr.utils.batched import batched
 from kubernetes.client.models import V1LabelSelector
 
-KindLiteral = Literal["Deployment", "DaemonSet", "StatefulSet", "Job", "CronJob", "Rollout", "DeploymentConfig", "StrimziPodSet"]
+KindLiteral = Literal["Deployment", "DaemonSet", "StatefulSet", "Job", "CronJob", "Rollout", "DeploymentConfig", "StrimziPodSet", "GroupedJob"]
 
 
 class PodData(pd.BaseModel):
diff --git a/robusta_krr/main.py b/robusta_krr/main.py
@@ -220,6 +220,12 @@ def run_strategy(
                     help="Max workers to use for async requests.",
                     rich_help_panel="Threading Settings",
                 ),
+                job_grouping_labels: Optional[str] = typer.Option(
+                    None,
+                    "--job-grouping-labels",
+                    help="Label name(s) to use for grouping jobs into GroupedJob workload type. Can be a single label or comma-separated labels (e.g., 'app,team').",
+                    rich_help_panel="Job Grouping Settings",
+                ),
                 format: str = typer.Option(
                     "table",
                     "--formatter",
@@ -357,6 +363,7 @@ def run_strategy(
                         coralogix_token=coralogix_token,
                         openshift=openshift,
                         max_workers=max_workers,
+                        job_grouping_labels=job_grouping_labels,
                         format=format,
                         show_cluster_name=show_cluster_name,
                         verbose=verbose,