Skip to content
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
680905d
make dal function to support issues and config changes
RoiGlinik Oct 21, 2025
d24dc10
add new tool for fetching issues only
RoiGlinik Oct 21, 2025
59246ad
add instructions for the new tool
RoiGlinik Oct 21, 2025
fb0b26b
fix typos
RoiGlinik Oct 21, 2025
9d9fddb
Merge branch 'master' into ROB-2290-workload-issues-tool
RoiGlinik Oct 22, 2025
39abd10
minor fixes. improve description. remove fixed external cluster
RoiGlinik Oct 22, 2025
dc6e904
fix copied name
RoiGlinik Oct 22, 2025
9bd38b3
remove finding type ALL for now
RoiGlinik Oct 22, 2025
8588c5a
remove some tools from test to avoid tools limit
RoiGlinik Oct 22, 2025
dd7e272
Merge branch 'master' into ROB-2290-workload-issues-tool
RoiGlinik Oct 23, 2025
1ccc79e
load mock dal for tests
RoiGlinik Oct 23, 2025
7bafa50
load issues json file as issues table
RoiGlinik Oct 23, 2025
c671cb3
mock issues function
RoiGlinik Oct 23, 2025
9595736
fix naming for inheritance
RoiGlinik Oct 23, 2025
5ccc0fc
create base test case for loading issues from history
RoiGlinik Oct 23, 2025
ace628d
Merge branch 'master' into ROB-2290-workload-issues-testing
RoiGlinik Oct 23, 2025
45ff238
allow mock dal to function without db connection
RoiGlinik Oct 23, 2025
322b1eb
move params to constructor to use required
RoiGlinik Oct 23, 2025
7e3def3
Merge branch 'master' into ROB-2290-workload-issues-testing
RoiGlinik Oct 26, 2025
b7c4677
fix description
RoiGlinik Oct 26, 2025
d7c338c
Merge branch 'master' into ROB-2290-workload-issues-testing
RoiGlinik Oct 26, 2025
59006b1
change test to easy
RoiGlinik Oct 27, 2025
bd18cfd
make sure values are set on failure
RoiGlinik Oct 27, 2025
1e965c9
Merge branch 'master' into ROB-2290-workload-issues-testing
RoiGlinik Oct 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions holmes/core/supabase_dal.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import threading
from datetime import datetime, timedelta
from enum import Enum
from typing import Dict, List, Optional, Tuple
from uuid import uuid4

Expand Down Expand Up @@ -53,6 +54,11 @@
ENRICHMENT_BLACKLIST_SET = set(ENRICHMENT_BLACKLIST)


class FindingType(str, Enum):
    """Kind of finding row to query; used as the value of the "finding_type" filter."""

    # Alert/issue findings (e.g. CrashLoopBackoff, OOMKilled in the mock data).
    ISSUE = "issue"
    # Kubernetes configuration-change findings.
    CONFIGURATION_CHANGE = "configuration_change"


class RobustaToken(BaseModel):
store_url: str
api_key: str
Expand Down Expand Up @@ -237,14 +243,15 @@ def get_resource_recommendation(
logging.exception("Supabase error while retrieving efficiency data")
return None

def get_configuration_changes_metadata(
def get_issues_metadata(
self,
start_datetime: str,
end_datetime: str,
limit: int = 100,
workload: Optional[str] = None,
ns: Optional[str] = None,
cluster: Optional[str] = None,
finding_type: FindingType = FindingType.CONFIGURATION_CHANGE,
) -> Optional[List[Dict]]:
if not self.enabled:
return []
Expand All @@ -265,12 +272,12 @@ def get_configuration_changes_metadata(
)
.eq("account_id", self.account_id)
.eq("cluster", cluster)
.eq("finding_type", "configuration_change")
.gte("creation_date", start_datetime)
.lte("creation_date", end_datetime)
.limit(limit)
)

query = query.eq("finding_type", finding_type.value)
if workload:
query.eq("subject_name", workload)
if ns:
Expand Down
61 changes: 50 additions & 11 deletions holmes/plugins/toolsets/robusta/robusta.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from typing import Optional, Dict, Any, List
from holmes.common.env_vars import load_bool
from holmes.core.supabase_dal import SupabaseDal
from holmes.core.supabase_dal import SupabaseDal, FindingType
from holmes.core.tools import (
StaticPrerequisite,
Tool,
Expand Down Expand Up @@ -168,7 +168,7 @@ def __init__(
required=True,
),
END_TIME: ToolParameter(
description="The starting time boundary for the search period. String in RFC3339 format.",
description="The ending time boundary for the search period. String in RFC3339 format.",
type="string",
required=True,
),
Expand All @@ -188,7 +188,7 @@ def __init__(
required=False,
),
"workload": ToolParameter(
description="The kubernetes workload name for filtering configuration changes. Deployment name or Pod name for example.",
description="Kubernetes resource name to filter configuration changes (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
type="string",
required=False,
),
Expand All @@ -202,11 +202,14 @@ def __init__(
)
self._dal = dal

def _fetch_change_history(
self, params: Dict, cluster: Optional[str] = None
def _fetch_issues(
self,
params: Dict,
cluster: Optional[str] = None,
finding_type: FindingType = FindingType.CONFIGURATION_CHANGE,
) -> Optional[List[Dict]]:
if self._dal and self._dal.enabled:
return self._dal.get_configuration_changes_metadata(
return self._dal.get_issues_metadata(
start_datetime=params["start_datetime"],
end_datetime=params["end_datetime"],
limit=min(
Expand All @@ -216,12 +219,13 @@ def _fetch_change_history(
ns=params.get("namespace"),
workload=params.get("workload"),
cluster=cluster,
finding_type=finding_type,
)
return None

def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
try:
changes = self._fetch_change_history(params)
changes = self._fetch_issues(params)
if changes:
return StructuredToolResult(
status=StructuredToolResultStatus.SUCCESS,
Expand All @@ -231,7 +235,7 @@ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolRes
else:
return StructuredToolResult(
status=StructuredToolResultStatus.NO_DATA,
data=f"Could not find changes for {params}",
data=f"{self.name} found no data. {params}",
params=params,
)
except Exception as e:
Expand All @@ -254,7 +258,7 @@ def __init__(self, dal: Optional[SupabaseDal]):
name="fetch_configuration_changes_metadata",
description=(
"Fetch configuration changes metadata in a given time range. "
"By default, fetch all cluster changes. Can be filtered on a given namespace or a specific workload. "
"By default, fetch all cluster changes. Can be filtered on a given namespace or a specific kubernetes resource. "
"Use fetch_finding_by_id to get detailed change of one specific configuration change."
),
)
Expand All @@ -278,13 +282,47 @@ def __init__(self, dal: Optional[SupabaseDal]):
add_cluster_filter=False,
)

def _fetch_change_history(self, params: Dict) -> Optional[List[Dict]]: # type: ignore
return super()._fetch_change_history(params, cluster="external")
def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
    """Fetch change history with the cluster filter fixed to "external"."""
    return super()._fetch_issues(params, cluster="external")

def get_parameterized_one_liner(self, params: Dict) -> str:
    """Return a one-line, human-readable description of this tool invocation."""
    return f"Robusta: Search External Change History {params}"


class FetchResourceIssuesMetadata(FetchConfigurationChangesMetadataBase):
    """Tool that fetches issue/alert metadata for one Kubernetes resource in a time range.

    Reuses the configuration-changes base tool, but:
    - requires both ``namespace`` and ``workload`` (the base declares them optional),
    - queries the DAL with ``finding_type=FindingType.ISSUE`` instead of the
      base's configuration-change default.
    """

    def __init__(self, dal: Optional[SupabaseDal]):
        super().__init__(
            dal=dal,
            name="fetch_resource_issues_metadata",
            description=(
                "Fetch issues and alert metadata in a given time range. "
                # NOTE: trailing space is required — without it the concatenated
                # description reads "...job, etc.Use fetch_finding_by_id...".
                "Must be filtered on a given namespace and specific kubernetes resource such as pod, deployment, job, etc. "
                "Use fetch_finding_by_id to get further information on a specific issue or alert."
            ),
            add_cluster_filter=False,
        )
        # Override the inherited optional parameters: this tool requires both a
        # namespace and a concrete resource name. Descriptions say "issues",
        # not "configuration changes", to match what this tool returns.
        self.parameters.update(
            {
                "namespace": ToolParameter(
                    description="The Kubernetes namespace name for filtering issues",
                    type="string",
                    required=True,
                ),
                "workload": ToolParameter(
                    description="Kubernetes resource name to filter issues (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
                    type="string",
                    required=True,
                ),
            }
        )

    def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
        # Same flow as the base class, but restricted to issue-type findings.
        return super()._fetch_issues(params, finding_type=FindingType.ISSUE)

    def get_parameterized_one_liner(self, params: Dict) -> str:
        """Return a one-line, human-readable description of this tool invocation."""
        return f"Robusta: fetch resource issues metadata {params}"


class RobustaToolset(Toolset):
def __init__(self, dal: Optional[SupabaseDal]):
dal_prereq = StaticPrerequisite(
Expand All @@ -300,6 +338,7 @@ def __init__(self, dal: Optional[SupabaseDal]):
FetchRobustaFinding(dal),
FetchConfigurationChangesMetadata(dal),
FetchResourceRecommendation(dal),
FetchResourceIssuesMetadata(dal),
]

if PULL_EXTERNAL_FINDINGS:
Expand Down
2 changes: 2 additions & 0 deletions holmes/plugins/toolsets/robusta/robusta_instructions.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* If a change seems important to the investigation, Use fetch_finding_by_id with the configuration change ID to get full details of the change.
* You must ALWAYS call fetch_configuration_changes_metadata when investigating an alert
* Never respond without calling fetch_configuration_changes_metadata
* When investigating a resource (pod, deployment, or job), if no relevant information is available from the live cluster at the time of investigation, call the fetch_resource_issues_metadata function to retrieve its historical alert data.
* You can use fetch_resource_issues_metadata to get issues context for a specific kubernetes resource. Start with a 4-hour window and expand to a 24-hour window if nothing comes up.
* When investigating an alert, look at historical configuration changes that happen 4 hours before the alert started
* If you found a change that caused the alert, you MUST write: 'The issue was introduced by ...' with a short description of the change, and the date of it.
For example:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
[
{
"id": "91160555-4aaf-4200-9fcf-3ffb93b93c89",
"description": "Back-off restarting failed container memory-eater in pod analytics-exporter-fast-76897854c-xxljr_default(dec2ce4b-a210-485d-8e6c-42b2ca28ecde)",
"source": "kubernetes_api_server",
"category": null,
"priority": "DEBUG",
"account_id": "16ecba1a-7993-4dd1-a98c-d201462ccba7",
"subject_type": "pod",
"subject_name": "analytics-exporter-fast-76897854c-xxljr",
"service_key": "default/Deployment/analytics-exporter-fast",
"subject_namespace": "default",
"cluster": "test",
"creation_date": "2025-10-19 10:59:27.643325",
"title": "BackOff Warning for Pod default/analytics-exporter-fast-76897854c-xxljr",
"aggregation_key": "PodLifecycleWarning",
"finding_type": "issue",
"failure": true,
"group_id": null,
"subject_node": null,
"starts_at": "2025-10-19 10:59:27.393256+00",
"ends_at": null,
"updated_at": "2025-10-19 10:59:27.577263+00",
"fingerprint": "adafae9087aaaab1db55131d4815737d4978b99956b71667dccb37bb23f8c1a9",
"video_links": [],
"service_kind": null,
"service_name": null,
"labels": {},
"annotations": {}
},
{
"id": "7a63a5cf-34e3-4802-a32f-c2a3de14d44a",
"description": null,
"source": "kubernetes_api_server",
"category": null,
"priority": "HIGH",
"account_id": "16ecba1a-7993-4dd1-a98c-d201462ccba7",
"subject_type": "pod",
"subject_name": "analytics-exporter-fast-76897854c-xxljr",
"service_key": "default/Deployment/analytics-exporter-fast",
"subject_namespace": "default",
"cluster": "test",
"creation_date": "2025-10-19 10:59:56.536309",
"title": "Crashing pod analytics-exporter-fast-76897854c-xxljr in namespace default",
"aggregation_key": "CrashLoopBackoff",
"finding_type": "issue",
"failure": true,
"group_id": null,
"subject_node": "aks-agentpool-35525070-vmss000001",
"starts_at": "2025-10-19 10:59:56.0862+00",
"ends_at": null,
"updated_at": "2025-10-19 10:59:56.459443+00",
"fingerprint": "71ea894a65ea183323d859d3c91951a3b0dba81a7ecda3a9cef605f23a81514c",
"video_links": [],
"service_kind": null,
"service_name": null,
"labels": {
"app": "analytics-exporter-fast",
"pod-template-hash": "76897854c"
},
"annotations": {}
},
{
"id": "b517520c-9fa9-491c-9754-821dc5ed1f75",
"description": null,
"source": "kubernetes_api_server",
"category": null,
"priority": "HIGH",
"account_id": "16ecba1a-7993-4dd1-a98c-d201462ccba7",
"subject_type": "pod",
"subject_name": "analytics-exporter-fast-76897854c-xxljr",
"service_key": "default/Deployment/analytics-exporter",
"subject_namespace": "default",
"cluster": "test",
"creation_date": "2025-10-19 10:59:56.536309",
"title": "Pod analytics-exporter-fast-76897854c-xxljr in namespace default OOMKilled results",
"aggregation_key": "PodOOMKilled",
"finding_type": "issue",
"failure": true,
"group_id": null,
"subject_node": "ip-10-0-237-138.us-east-2.compute.internal",
"starts_at": "2025-10-19 10:59:56.536309+00",
"ends_at": null,
"updated_at": "2025-10-19 10:59:56.536309+00",
"fingerprint": "3e03802bbc0a878f40772250cf5bb9926b504b0e9429b9ce3373922d9d067d72",
"video_links": [],
"service_kind": null,
"service_name": null,
"labels": {
"app": "analytics-exporter",
"pod-template-hash": "7fb8857595"
},
"annotations": {
"robusta.kubernetes.io/restartedAt": "2025-10-19T10:59:56.565094+00:00"
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
description: |
A test case that simulates an issue a real customer ran into.
When Holmes was asked about a pod that died some time ago and could not find any information about it, it made up an answer about the pod's condition.
This test checks the new fetch_resource_issues_metadata tool, which lets Holmes retrieve historical issues from our timeline and find the pod kill reason in history.
user_prompt: "why did my pod analytics-exporter-fast-76897854c-xxljr died around 2025-10-19?"
expected_output:
- The result mentions the analytics-exporter-fast pod was OOMKILLED
before_test:
after_test:


test_type: "server"

tags:
- medium
- kubernetes
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
toolsets:
kubernetes/core:
enabled: true
robusta:
enabled: true
kubernetes/logs:
enabled: true
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
toolsets:
prometheus/metrics:
enabled: False
kubernetes/kube-lineage-extras:
enabled: true
kubernetes/logs:
enabled: False
kubernetes/core:
enabled: true
datadog/logs:
enabled: True
datadog/metrics:
enabled: True
datadog/traces:
enabled: True
Loading
Loading