Commits (24)
680905d
make dal function to support issues and config changes
RoiGlinik Oct 21, 2025
d24dc10
add new tool for fetching issues only
RoiGlinik Oct 21, 2025
59246ad
add instructions for the new tool
RoiGlinik Oct 21, 2025
fb0b26b
fix typos
RoiGlinik Oct 21, 2025
9d9fddb
Merge branch 'master' into ROB-2290-workload-issues-tool
RoiGlinik Oct 22, 2025
39abd10
minor fixes. improve description. remove fixed external cluster
RoiGlinik Oct 22, 2025
dc6e904
fix copied name
RoiGlinik Oct 22, 2025
9bd38b3
remove finding type ALL for now
RoiGlinik Oct 22, 2025
8588c5a
remove some tools from test to avoid tools limit
RoiGlinik Oct 22, 2025
dd7e272
Merge branch 'master' into ROB-2290-workload-issues-tool
RoiGlinik Oct 23, 2025
1ccc79e
load mock dal for tests
RoiGlinik Oct 23, 2025
7bafa50
load issues json file as issues table
RoiGlinik Oct 23, 2025
c671cb3
mock issues function
RoiGlinik Oct 23, 2025
9595736
fix naming for inheritance
RoiGlinik Oct 23, 2025
5ccc0fc
create base test case for loading issues from history
RoiGlinik Oct 23, 2025
ace628d
Merge branch 'master' into ROB-2290-workload-issues-testing
RoiGlinik Oct 23, 2025
45ff238
allow mock dal to function without db connection
RoiGlinik Oct 23, 2025
322b1eb
move params to constructor to use required
RoiGlinik Oct 23, 2025
7e3def3
Merge branch 'master' into ROB-2290-workload-issues-testing
RoiGlinik Oct 26, 2025
b7c4677
fix description
RoiGlinik Oct 26, 2025
d7c338c
Merge branch 'master' into ROB-2290-workload-issues-testing
RoiGlinik Oct 26, 2025
59006b1
change test to easy
RoiGlinik Oct 27, 2025
bd18cfd
make sure values are set on failure
RoiGlinik Oct 27, 2025
1e965c9
Merge branch 'master' into ROB-2290-workload-issues-testing
RoiGlinik Oct 28, 2025
30 changes: 22 additions & 8 deletions holmes/plugins/toolsets/robusta/robusta.py
@@ -202,7 +202,7 @@ def __init__(
        )
        self._dal = dal

-    def _fetch_change_history(
+    def _fetch_issues(
        self,
        params: Dict,
        cluster: Optional[str] = None,
@@ -225,7 +225,7 @@ def _fetch_change_history(

    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
        try:
-            changes = self._fetch_change_history(params)
+            changes = self._fetch_issues(params)
            if changes:
                return StructuredToolResult(
                    status=StructuredToolResultStatus.SUCCESS,
@@ -282,8 +282,8 @@ def __init__(self, dal: Optional[SupabaseDal]):
            add_cluster_filter=False,
        )

-    def _fetch_change_history(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
-        return super()._fetch_change_history(params, cluster="external")
+    def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+        return super()._fetch_issues(params, cluster="external")

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return f"Robusta: Search External Change History {params}"
@@ -296,14 +296,28 @@ def __init__(self, dal: Optional[SupabaseDal]):
            name="fetch_resource_issues_metadata",
            description=(
                "Fetch issues and alert metadata in a given time range. "
-                "Must be filtered on a given namespace and specific kubernetes resource such as pod, deployment, job, etc."
+                "Must be filtered on a given namespace and specific kubernetes resource, such as pod, deployment, job, etc. "
+                "Use fetch_finding_by_id to get further information on a specific issue or alert."
            ),
-            add_cluster_filter=True,
+            add_cluster_filter=False,
        )
+        self.parameters.update(
+            {
+                "namespace": ToolParameter(
+                    description="The Kubernetes namespace name for filtering issues and alerts",
+                    type="string",
+                    required=True,
+                ),
+                "workload": ToolParameter(
+                    description="Kubernetes resource name to filter issues and alerts (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
+                    type="string",
+                    required=True,
+                ),
+            }
+        )

-    def _fetch_resource_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
-        return super()._fetch_change_history(params, finding_type=FindingType.ISSUE)
+    def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+        return super()._fetch_issues(params, finding_type=FindingType.ISSUE)

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return f"Robusta: fetch resource issues metadata {params}"
97 changes: 97 additions & 0 deletions issues_metadata.json
@@ -0,0 +1,97 @@
[
{
"id": "91160555-4aaf-4200-9fcf-3ffb93b93c89",
"description": "Back-off restarting failed container memory-eater in pod analytics-exporter-fast-76897854c-xxljr_default(dec2ce4b-a210-485d-8e6c-42b2ca28ecde)",
"source": "kubernetes_api_server",
"category": null,
"priority": "DEBUG",
"account_id": "16ecba1a-7993-4dd1-a98c-d201462ccba7",
"subject_type": "pod",
"subject_name": "analytics-exporter-fast-76897854c-xxljr",
"service_key": "default/Deployment/analytics-exporter-fast",
"subject_namespace": "default",
"cluster": "test",
"creation_date": "2025-10-19 10:59:27.643325",
"title": "BackOff Warning for Pod default/analytics-exporter-fast-76897854c-xxljr",
"aggregation_key": "PodLifecycleWarning",
"finding_type": "issue",
"failure": true,
"group_id": null,
"subject_node": null,
"starts_at": "2025-10-19 10:59:27.393256+00",
"ends_at": null,
"updated_at": "2025-10-19 10:59:27.577263+00",
"fingerprint": "adafae9087aaaab1db55131d4815737d4978b99956b71667dccb37bb23f8c1a9",
"video_links": [],
"service_kind": null,
"service_name": null,
"labels": {},
"annotations": {}
},
{
"id": "7a63a5cf-34e3-4802-a32f-c2a3de14d44a",
"description": null,
"source": "kubernetes_api_server",
"category": null,
"priority": "HIGH",
"account_id": "16ecba1a-7993-4dd1-a98c-d201462ccba7",
"subject_type": "pod",
"subject_name": "analytics-exporter-fast-76897854c-xxljr",
"service_key": "default/Deployment/analytics-exporter-fast",
"subject_namespace": "default",
"cluster": "test",
"creation_date": "2025-10-19 10:59:56.536309",
"title": "Crashing pod analytics-exporter-fast-76897854c-xxljr in namespace default",
"aggregation_key": "CrashLoopBackoff",
"finding_type": "issue",
"failure": true,
"group_id": null,
"subject_node": "aks-agentpool-35525070-vmss000001",
"starts_at": "2025-10-19 10:59:56.0862+00",
"ends_at": null,
"updated_at": "2025-10-19 10:59:56.459443+00",
"fingerprint": "71ea894a65ea183323d859d3c91951a3b0dba81a7ecda3a9cef605f23a81514c",
"video_links": [],
"service_kind": null,
"service_name": null,
"labels": {
"app": "analytics-exporter-fast",
"pod-template-hash": "76897854c"
},
"annotations": {}
},
{
"id": "b517520c-9fa9-491c-9754-821dc5ed1f75",
"description": null,
"source": "kubernetes_api_server",
"category": null,
"priority": "HIGH",
"account_id": "16ecba1a-7993-4dd1-a98c-d201462ccba7",
"subject_type": "pod",
"subject_name": "analytics-exporter-fast-76897854c-xxljr",
"service_key": "default/Deployment/analytics-exporter",
"subject_namespace": "default",
"cluster": "test",
"creation_date": "2025-10-19 10:59:56.536309",
"title": "Pod analytics-exporter-fast-76897854c-xxljr in namespace default OOMKilled results",
"aggregation_key": "PodOOMKilled",
"finding_type": "issue",
"failure": true,
"group_id": null,
"subject_node": "ip-10-0-237-138.us-east-2.compute.internal",
"starts_at": "2025-10-19 10:59:56.536309+00",
"ends_at": null,
"updated_at": "2025-10-19 10:59:56.536309+00",
"fingerprint": "3e03802bbc0a878f40772250cf5bb9926b504b0e9429b9ce3373922d9d067d72",
"video_links": [],
"service_kind": null,
"service_name": null,
"labels": {
"app": "analytics-exporter",
"pod-template-hash": "7fb8857595"
},
"annotations": {
"robusta.kubernetes.io/restartedAt": "2025-10-19T10:59:56.565094+00:00"
}
}
]
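
The fixture mixes naive timestamps (creation_date) with "+00"-offset ones (starts_at, updated_at). A minimal normalization sketch, assuming Python 3.11+ for fromisoformat's relaxed offset parsing:

from datetime import datetime, timezone

creation_date = "2025-10-19 10:59:27.643325"   # naive, no offset
starts_at = "2025-10-19 10:59:27.393256+00"    # "+00" UTC offset

def to_utc(value: str) -> datetime:
    dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
    if dt.tzinfo is None:
        # Naive values are treated as UTC here; note that the mock DAL below
        # uses astimezone(), which treats naive values as local time instead.
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)

print(to_utc(creation_date), to_utc(starts_at))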
@@ -0,0 +1,17 @@
description: |
  A test case that simulates an issue a real customer ran into.
  When Holmes was asked about a pod that had died some time ago and could not find any information about it, it made up an answer about its condition.
  This test checks the new fetch_resource_issues_metadata tool, which lets Holmes retrieve historical issues from our timeline and find in history the reason the pod was killed.

user_prompt: "why did my pod analytics-exporter-fast-76897854c-xxljr died around 2025-10-19?"
expected_output:
  - The result mentions the analytics-exporter-fast pod was OOMKilled
before_test:
after_test:


test_type: "server"

tags:
  - easy
  - kubernetes
7 changes: 7 additions & 0 deletions toolsets.yaml
@@ -0,0 +1,7 @@
toolsets:
  kubernetes/core:
    enabled: true
  robusta:
    enabled: true
  kubernetes/logs:
    enabled: true
88 changes: 85 additions & 3 deletions tests/llm/utils/mock_dal.py
@@ -2,26 +2,34 @@
import json
import logging
from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict, Optional, List

from pydantic import TypeAdapter

-from holmes.core.supabase_dal import SupabaseDal
+from holmes.core.supabase_dal import SupabaseDal, FindingType
from holmes.core.tool_calling_llm import Instructions, ResourceInstructions
from tests.llm.utils.test_case_utils import read_file
+from datetime import datetime, timezone


class MockSupabaseDal(SupabaseDal):
    def __init__(
        self,
        test_case_folder: Path,
        issue_data: Optional[Dict],
+        issues_metadata: Optional[List[Dict]],
        resource_instructions: Optional[ResourceInstructions],
        generate_mocks: bool,
    ):
-        super().__init__(cluster="test")
+        try:
+            super().__init__(cluster="test")
+        except Exception:
+            logging.warning(
+                "MockSupabaseDal could not connect to the DB. Running in pure mock mode; real DB calls and --generate-mocks will fail."
+            )
        self._issue_data = issue_data
        self._resource_instructions = resource_instructions
+        self._issues_metadata = issues_metadata
        self._test_case_folder = test_case_folder
        self._generate_mocks = generate_mocks

@@ -72,6 +80,74 @@ def get_global_instructions_for_account(self) -> Optional[Instructions]:
    def get_workload_issues(self, *args) -> list:
        return []

+    def get_issues_metadata(
+        self,
+        start_datetime: str,
+        end_datetime: str,
+        limit: int = 100,
+        workload: Optional[str] = None,
+        ns: Optional[str] = None,
+        cluster: Optional[str] = None,
+        finding_type: FindingType = FindingType.CONFIGURATION_CHANGE,
+    ) -> Optional[List[Dict]]:
+        if self._issues_metadata is not None:
+            filtered_data = []
+            if not cluster:
+                cluster = self.cluster
+            for item in self._issues_metadata:
+                creation_date, start, end = [
+                    datetime.fromisoformat(dt.replace("Z", "+00:00")).astimezone(
+                        timezone.utc
+                    )
+                    for dt in (item["creation_date"], start_datetime, end_datetime)
+                ]
+                if not (start <= creation_date <= end):
+                    continue
+                if item.get("finding_type") != finding_type.value:
+                    continue
+                if item.get("cluster") != cluster:
+                    continue
+                if workload:
+                    if item.get("subject_name") != workload:
+                        continue
+                if ns:
+                    if item.get("subject_namespace") != ns:
+                        continue
+
+                filtered_item = {
+                    "id": item.get("id"),
+                    "title": item.get("title"),
+                    "subject_name": item.get("subject_name"),
+                    "subject_namespace": item.get("subject_namespace"),
+                    "subject_type": item.get("subject_type"),
+                    "description": item.get("description"),
+                    "starts_at": item.get("starts_at"),
+                    "ends_at": item.get("ends_at"),
+                }
+                filtered_data.append(filtered_item)
+            filtered_data = filtered_data[:limit]
+
+            return filtered_data if filtered_data else None
+        else:
+            data = super().get_issues_metadata(
+                start_datetime, end_datetime, limit, workload, ns, cluster, finding_type
+            )
+            if self._generate_mocks:
+                file_path = self._get_mock_file_path("issues_metadata")
+
+                with open(file_path, "w") as f:
+                    json.dump(data or {}, f, indent=2)
+
+                logging.warning(
+                    f"A mock file was generated for you at {file_path} "
+                    f"with the content of dal.get_issues_metadata("
+                    f"{start_datetime}, {end_datetime}, {limit}, "
+                    f"{workload}, {ns}, {finding_type})"
+                )
+
+            return data


pydantic_resource_instructions = TypeAdapter(ResourceInstructions)

@@ -82,6 +158,11 @@ def load_mock_dal(test_case_folder: Path, generate_mocks: bool):
    if issue_data_mock_path.exists():
        issue_data = json.loads(read_file(issue_data_mock_path))

+    issues_metadata_path = test_case_folder.joinpath(Path("issues_metadata.json"))
+    issues_metadata = None
+    if issues_metadata_path.exists():
+        issues_metadata = json.loads(read_file(issues_metadata_path))
+
    resource_instructions_mock_path = test_case_folder.joinpath(
        Path("resource_instructions.json")
    )
@@ -95,5 +176,6 @@
        test_case_folder=test_case_folder,
        issue_data=issue_data,
        resource_instructions=resource_instructions,
+        issues_metadata=issues_metadata,
        generate_mocks=generate_mocks,
    )
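
A minimal usage sketch of the new mock path, assuming it runs from the repo root so the tests package is importable; the fixture folder path is hypothetical. Note that finding_type defaults to CONFIGURATION_CHANGE, so FindingType.ISSUE has to be passed explicitly to match the fixture entries above.

from pathlib import Path

from holmes.core.supabase_dal import FindingType
from tests.llm.utils.mock_dal import load_mock_dal

# Hypothetical fixture folder containing the issues_metadata.json shown above.
mock_dal = load_mock_dal(
    test_case_folder=Path("tests/llm/fixtures/some_test_case"),
    generate_mocks=False,
)
issues = mock_dal.get_issues_metadata(
    start_datetime="2025-10-19T00:00:00+00:00",
    end_datetime="2025-10-20T00:00:00+00:00",
    workload="analytics-exporter-fast-76897854c-xxljr",
    ns="default",
    cluster="test",  # matches the fixture's "cluster" field
    finding_type=FindingType.ISSUE,  # default is CONFIGURATION_CHANGE
)
# Returns the trimmed metadata dicts (id, title, subject_* fields, description,
# starts_at, ends_at) for matching entries, or None when nothing matches.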
9 changes: 8 additions & 1 deletion tests/llm/utils/mock_toolset.py
@@ -10,6 +10,8 @@
import threading
from pydantic import BaseModel
import pytest
+from tests.llm.utils.mock_dal import load_mock_dal
+from pathlib import Path

from holmes.core.tools import (
    StructuredToolResult,
@@ -643,8 +645,13 @@ def _get_toolset_mode(self, toolset_name: str) -> MockMode:

    def _initialize_toolsets(self):
        """Initialize and configure toolsets."""
+
+        mock_dal = load_mock_dal(
+            test_case_folder=Path(self.test_case_folder),
+            generate_mocks=self.mock_generation_config.generate_mocks,
+        )
        # Load builtin toolsets
-        builtin_toolsets = load_builtin_toolsets()
+        builtin_toolsets = load_builtin_toolsets(mock_dal)

        # Load custom toolsets from YAML if present
        config_path = os.path.join(self.test_case_folder, "toolsets.yaml")