diff --git a/holmes/plugins/toolsets/robusta/robusta.py b/holmes/plugins/toolsets/robusta/robusta.py
index c2315d4cd..655a89fbf 100644
--- a/holmes/plugins/toolsets/robusta/robusta.py
+++ b/holmes/plugins/toolsets/robusta/robusta.py
@@ -202,7 +202,7 @@ def __init__(
         )
         self._dal = dal
 
-    def _fetch_change_history(
+    def _fetch_issues(
         self,
         params: Dict,
         cluster: Optional[str] = None,
@@ -225,7 +225,7 @@ def _fetch_change_history(
 
     def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         try:
-            changes = self._fetch_change_history(params)
+            changes = self._fetch_issues(params)
             if changes:
                 return StructuredToolResult(
                     status=StructuredToolResultStatus.SUCCESS,
@@ -282,8 +282,8 @@ def __init__(self, dal: Optional[SupabaseDal]):
             add_cluster_filter=False,
         )
 
-    def _fetch_change_history(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
-        return super()._fetch_change_history(params, cluster="external")
+    def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+        return super()._fetch_issues(params, cluster="external")
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
         return f"Robusta: Search External Change History {params}"
@@ -296,14 +296,28 @@ def __init__(self, dal: Optional[SupabaseDal]):
             name="fetch_resource_issues_metadata",
             description=(
                 "Fetch issues and alert metadata in a given time range. "
-                "Must be filtered on a given namespace and specific kubernetes resource such as pod, deployment, job, etc."
+                "Must be filtered on a given namespace and specific kubernetes resource, such as pod, deployment, job, etc. "
                 "Use fetch_finding_by_id to get further information on a specific issue or alert."
             ),
-            add_cluster_filter=True,
+            add_cluster_filter=False,
+        )
+        self.parameters.update(
+            {
+                "namespace": ToolParameter(
+                    description="The Kubernetes namespace name for filtering issues and alerts",
+                    type="string",
+                    required=True,
+                ),
+                "workload": ToolParameter(
+                    description="Kubernetes resource name to filter issues and alerts (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
+                    type="string",
+                    required=True,
+                ),
+            }
         )
 
-    def _fetch_resource_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
-        return super()._fetch_change_history(params, finding_type=FindingType.ISSUE)
+    def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+        return super()._fetch_issues(params, finding_type=FindingType.ISSUE)
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
         return f"Robusta: fetch resource issues metadata {params}"
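As an illustration of the two parameters added above, here is a hedged sketch (not part of the diff) of a plausible argument payload for the fetch_resource_issues_metadata tool, using the pod from the fixture added below. The time-range parameters come from the shared base tool and are not shown in this hunk, so they are omitted.

    # Sketch only: hypothetical arguments a model could pass to fetch_resource_issues_metadata.
    # "namespace" and "workload" are the parameters added in this diff; the workload must be
    # the full Pod name, including the generated suffix, per the parameter description.
    example_params = {
        "namespace": "default",
        "workload": "analytics-exporter-fast-76897854c-xxljr",
    }
    # The overridden _fetch_issues forwards these params with finding_type=FindingType.ISSUE,
    # so only findings of type "issue" are returned.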
"creation_date": "2025-10-19 10:59:56.536309", + "title": "Pod analytics-exporter-fast-76897854c-xxljr in namespace default OOMKilled results", + "aggregation_key": "PodOOMKilled", + "finding_type": "issue", + "failure": true, + "group_id": null, + "subject_node": "ip-10-0-237-138.us-east-2.compute.internal", + "starts_at": "2025-10-19 10:59:56.536309+00", + "ends_at": null, + "updated_at": "2025-10-19 10:59:56.536309+00", + "fingerprint": "3e03802bbc0a878f40772250cf5bb9926b504b0e9429b9ce3373922d9d067d72", + "video_links": [], + "service_kind": null, + "service_name": null, + "labels": { + "app": "analytics-exporter", + "pod-template-hash": "7fb8857595" + }, + "annotations": { + "robusta.kubernetes.io/restartedAt": "2025-10-19T10:59:56.565094+00:00" + } + } +] diff --git a/tests/llm/fixtures/test_ask_holmes/18_oom_kill_from_issues_history/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/18_oom_kill_from_issues_history/test_case.yaml new file mode 100644 index 000000000..8e958bf21 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/18_oom_kill_from_issues_history/test_case.yaml @@ -0,0 +1,17 @@ +description: | + A test case that simulates an issue real customer bumped into. + When holmes was asked about a pod that died some time ago and could not find any information about it , it made up an answer about his condition. + This test checks the new fetch_resource_issue_metadata function to allow holmes to get historical issues from our timeline and find the pod kill reason in history. + +user_prompt: "why did my pod analytics-exporter-fast-76897854c-xxljr died around 2025-10-19?" +expected_output: + - The result mentions the analytics-exporter-fast pod was OOMKILLED +before_test: +after_test: + + +test_type: "server" + +tags: + - easy + - kubernetes diff --git a/tests/llm/fixtures/test_ask_holmes/18_oom_kill_from_issues_history/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/18_oom_kill_from_issues_history/toolsets.yaml new file mode 100644 index 000000000..9d89a2174 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/18_oom_kill_from_issues_history/toolsets.yaml @@ -0,0 +1,7 @@ +toolsets: + kubernetes/core: + enabled: true + robusta: + enabled: true + kubernetes/logs: + enabled: true diff --git a/tests/llm/utils/mock_dal.py b/tests/llm/utils/mock_dal.py index bc2893142..d8ab97be3 100644 --- a/tests/llm/utils/mock_dal.py +++ b/tests/llm/utils/mock_dal.py @@ -6,11 +6,12 @@ from pydantic import TypeAdapter -from holmes.core.supabase_dal import SupabaseDal +from holmes.core.supabase_dal import SupabaseDal, FindingType from holmes.core.tool_calling_llm import ResourceInstructions from holmes.plugins.runbooks import RobustaRunbookInstruction from holmes.utils.global_instructions import Instructions from tests.llm.utils.test_case_utils import read_file +from datetime import datetime, timezone class MockSupabaseDal(SupabaseDal): @@ -18,12 +19,20 @@ def __init__( self, test_case_folder: Path, issue_data: Optional[Dict], + issues_metadata: Optional[List[Dict]], resource_instructions: Optional[ResourceInstructions], generate_mocks: bool, initialize_base: bool = True, ): if initialize_base: - super().__init__(cluster="test") + try: + super().__init__(cluster="test") + except: # noqa: E722 + self.enabled = True + self.cluster = "test" + logging.warning( + "Mocksupabase dal could not connect to db. Running in pure mock mode. real db calls and --generate-mock will fail." 
+                )
         else:
             # For only using mock data without initializing the base class
             # Don't call super().__init__ to avoid initializing Supabase connection
@@ -33,6 +42,7 @@ def __init__(
 
         self._issue_data = issue_data
         self._resource_instructions = resource_instructions
+        self._issues_metadata = issues_metadata
         self._test_case_folder = test_case_folder
         self._generate_mocks = generate_mocks
 
@@ -121,6 +131,74 @@ def get_global_instructions_for_account(self) -> Optional[Instructions]:
     def get_workload_issues(self, *args) -> list:
         return []
 
+    def get_issues_metadata(
+        self,
+        start_datetime: str,
+        end_datetime: str,
+        limit: int = 100,
+        workload: Optional[str] = None,
+        ns: Optional[str] = None,
+        cluster: Optional[str] = None,
+        finding_type: FindingType = FindingType.CONFIGURATION_CHANGE,
+    ) -> Optional[List[Dict]]:
+        if self._issues_metadata is not None:
+            filtered_data = []
+            if not cluster:
+                cluster = self.cluster
+            for item in self._issues_metadata:
+                creation_date, start, end = [
+                    datetime.fromisoformat(dt.replace("Z", "+00:00")).astimezone(
+                        timezone.utc
+                    )
+                    for dt in (item["creation_date"], start_datetime, end_datetime)
+                ]
+                if not (start <= creation_date <= end):
+                    continue
+                if item.get("finding_type") != finding_type.value:
+                    continue
+                if item.get("cluster") != cluster:
+                    continue
+                if workload:
+                    if item.get("subject_name") != workload:
+                        continue
+                if ns:
+                    if item.get("subject_namespace") != ns:
+                        continue
+
+                filtered_item = {
+                    "id": item.get("id"),
+                    "title": item.get("title"),
+                    "subject_name": item.get("subject_name"),
+                    "subject_namespace": item.get("subject_namespace"),
+                    "subject_type": item.get("subject_type"),
+                    "description": item.get("description"),
+                    "starts_at": item.get("starts_at"),
+                    "ends_at": item.get("ends_at"),
+                }
+                filtered_data.append(filtered_item)
+            filtered_data = filtered_data[:limit]
+
+            return filtered_data if filtered_data else None
+        else:
+            data = super().get_issues_metadata(
+                start_datetime, end_datetime, limit, workload, ns, cluster, finding_type
+            )
+            if self._generate_mocks:
+                file_path = self._get_mock_file_path("issues_metadata")
+
+                with open(file_path, "w") as f:
+                    f.write(json.dumps(data or {}, indent=2))
+                    f.close()
+
+                logging.warning(
+                    f"A mock file was generated for you at {file_path} "
+                    f"with the content of dal.get_issues_metadata("
+                    f"{start_datetime}, {end_datetime}, {limit}, "
+                    f"{workload}, {ns}, {finding_type})"
+                )
+
+            return data
+
 
 pydantic_resource_instructions = TypeAdapter(ResourceInstructions)
 pydantic_instructions = TypeAdapter(Instructions)
@@ -134,6 +212,11 @@ def load_mock_dal(
     if issue_data_mock_path.exists():
         issue_data = json.loads(read_file(issue_data_mock_path))
 
+    issues_metadata_path = test_case_folder.joinpath(Path("issues_metadata.json"))
+    issues_metadata = None
+    if issues_metadata_path.exists():
+        issues_metadata = json.loads(read_file(issues_metadata_path))
+
     resource_instructions_mock_path = test_case_folder.joinpath(
         Path("resource_instructions.json")
     )
@@ -147,6 +230,7 @@
         test_case_folder=test_case_folder,
         issue_data=issue_data,
         resource_instructions=resource_instructions,
+        issues_metadata=issues_metadata,
         generate_mocks=generate_mocks,
         initialize_base=initialize_base,
     )
diff --git a/tests/llm/utils/mock_toolset.py b/tests/llm/utils/mock_toolset.py
index 8769e49b3..e25e2cb17 100644
--- a/tests/llm/utils/mock_toolset.py
+++ b/tests/llm/utils/mock_toolset.py
@@ -645,8 +645,13 @@ def _get_toolset_mode(self, toolset_name: str) -> MockMode:
 
     def _initialize_toolsets(self):
         """Initialize and configure toolsets."""
+
+        mock_dal = load_mock_dal(
+            test_case_folder=Path(self.test_case_folder),
+            generate_mocks=self.mock_generation_config.generate_mocks,
+        )
         # Load builtin toolsets
-        builtin_toolsets = load_builtin_toolsets()
+        builtin_toolsets = load_builtin_toolsets(mock_dal)
 
         # Load custom toolsets from YAML if present
         config_path = os.path.join(self.test_case_folder, "toolsets.yaml")