Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 84 additions & 15 deletions holmes/plugins/toolsets/datadog/toolset_datadog_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
LoggingCapability,
PodLoggingTool,
)
from holmes.core.tools import ToolParameter
from holmes.plugins.toolsets.utils import process_timestamps_to_rfc3339


Expand Down Expand Up @@ -68,6 +69,31 @@ def calculate_page_size(
return min(dd_config.page_size, max(0, max_logs_count - logs_count))


def build_datadog_query(
params: FetchPodLogsParams,
dd_config: DatadogLogsConfig,
) -> str:
"""Build Datadog query string from parameters, handling wildcards properly."""
query_parts = []

# Only add namespace filter if not empty or single wildcard
if params.namespace and params.namespace != "*":
query_parts.append(f"{dd_config.labels.namespace}:{params.namespace}")

# Only add pod filter if not empty or single wildcard
if params.pod_name and params.pod_name != "*":
query_parts.append(f"{dd_config.labels.pod}:{params.pod_name}")

# Start with base query or use "*" if no specific filters
query = " ".join(query_parts) if query_parts else "*"

if params.filter:
filter = params.filter.replace('"', '\\"')
query += f' "{filter}"'

return query


def fetch_paginated_logs(
params: FetchPodLogsParams,
dd_config: DatadogLogsConfig,
Expand All @@ -84,11 +110,8 @@ def fetch_paginated_logs(
url = f"{dd_config.site_api_url}/api/v2/logs/events/search"
headers = get_headers(dd_config)

query = f"{dd_config.labels.namespace}:{params.namespace}"
query += f" {dd_config.labels.pod}:{params.pod_name}"
if params.filter:
filter = params.filter.replace('"', '\\"')
query += f' "{filter}"'
# Build query using helper function
query = build_datadog_query(params, dd_config)

payload: Dict[str, Any] = {
"filter": {
Expand Down Expand Up @@ -169,12 +192,8 @@ def generate_datadog_logs_url(
# Convert API URL to app URL using the shared helper
base_url = convert_api_url_to_app_url(dd_config.site_api_url)

# Build the query string
query = f"{dd_config.labels.namespace}:{params.namespace}"
query += f" {dd_config.labels.pod}:{params.pod_name}"
if params.filter:
filter = params.filter.replace('"', '\\"')
query += f' "{filter}"'
# Build query using helper function
query = build_datadog_query(params, dd_config)

# Process timestamps - get Unix timestamps in seconds
(from_time_seconds, to_time_seconds) = process_timestamps_to_int(
Expand Down Expand Up @@ -204,6 +223,24 @@ def generate_datadog_logs_url(
return f"{base_url}/logs?{urlencode(url_params)}"


class DatadogPodLoggingTool(PodLoggingTool):
"""Custom pod logging tool for Datadog with wildcard support"""

def _get_tool_parameters(self, toolset: BasePodLoggingToolset) -> dict:
"""Override to add wildcard support to pod_name parameter"""
# Get base parameters from parent
params = super()._get_tool_parameters(toolset)

# Override pod_name description to indicate wildcard support
params["pod_name"] = ToolParameter(
description="The kubernetes pod name. Use '*' to fetch logs from all pods in the namespace, or use wildcards like 'payment-*' to match multiple pods",
type="string",
required=True,
)

return params


class DatadogLogsToolset(BasePodLoggingToolset):
dd_config: Optional[DatadogLogsConfig] = None

Expand All @@ -225,7 +262,8 @@ def __init__(self):
tags=[ToolsetTag.CORE],
)
# Now that parent is initialized and self.name exists, create the tool
self.tools = [PodLoggingTool(self)]
# Use our custom DatadogPodLoggingTool with wildcard support
self.tools = [DatadogPodLoggingTool(self)]
self._reload_instructions()

def logger_name(self) -> str:
Expand Down Expand Up @@ -397,23 +435,54 @@ def _perform_healthcheck(self) -> Tuple[bool, str]:
"""
try:
logging.debug("Performing Datadog configuration healthcheck...")
# Use wildcards which are now properly handled by the query builder
# The query builder will detect "*" and create a broad search query
healthcheck_params = FetchPodLogsParams(
namespace="*",
pod_name="*",
limit=1,
start_time="-172800", # 48 hours in seconds
)

# Calculate actual timestamps for debugging
from holmes.plugins.toolsets.utils import process_timestamps_to_int
from datetime import datetime

(from_time, to_time) = process_timestamps_to_int(
start=healthcheck_params.start_time,
end=healthcheck_params.end_time,
default_time_span_seconds=86400,
)
from_dt = datetime.fromtimestamp(from_time)
to_dt = datetime.fromtimestamp(to_time)

logging.info(
f"DEBUG: Running Datadog healthcheck with params: namespace={healthcheck_params.namespace}, pod_name={healthcheck_params.pod_name}, start_time={healthcheck_params.start_time}"
)
logging.info(
f"DEBUG: Healthcheck time range: from {from_dt.isoformat()} to {to_dt.isoformat()}"
)
if self.dd_config:
logging.info(
f"DEBUG: Healthcheck query will be sent to: {self.dd_config.site_api_url}"
)

result = self.fetch_pod_logs(healthcheck_params)
logging.info(
f"DEBUG: Healthcheck result status: {result.status}, error: {result.error}, data length: {len(result.data or '')}"
)

if result.status == StructuredToolResultStatus.ERROR:
error_msg = result.error or "Unknown error during healthcheck"
logging.error(f"Datadog healthcheck failed: {error_msg}")
return False, f"Datadog healthcheck failed: {error_msg}"
elif result.status == StructuredToolResultStatus.NO_DATA:
error_msg = "No logs were found in the last 48 hours using wildcards for pod and namespace. Is the configuration correct?"
logging.error(f"Datadog healthcheck failed: {error_msg}")
return False, f"Datadog healthcheck failed: {error_msg}"
# NO_DATA is acceptable for healthcheck - it just means no logs exist yet
# The important thing is that the API call succeeded
logging.info(
"Datadog healthcheck completed successfully (no data found, but API is accessible)"
)
return True, ""

logging.info("Datadog healthcheck completed successfully")
return True, ""
Expand Down
23 changes: 21 additions & 2 deletions holmes/plugins/toolsets/grafana/loki_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,15 @@ def execute_loki_query(
return []

except requests.exceptions.RequestException as e:
raise Exception(f"Failed to query Loki logs: {str(e)}")
error_details = (
f"Failed to query Loki logs:\n"
f" URL: {url}\n"
f" Query: {query}\n"
f" Time range: {start} to {end}\n"
f" Limit: {limit}\n"
f" Error: {str(e)}"
)
raise Exception(error_details)
Comment on lines +62 to +70
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Add exception chaining to preserve context.

The enhanced error message with query details is excellent for debugging. However, line 70 should use raise ... from e to preserve the exception chain, which is important for debugging and traceback clarity.

Apply this diff:

         error_details = (
             f"Failed to query Loki logs:\n"
             f"  URL: {url}\n"
             f"  Query: {query}\n"
             f"  Time range: {start} to {end}\n"
             f"  Limit: {limit}\n"
-            f"  Error: {str(e)}"
+            f"  Error: {e!s}"
         )
-        raise Exception(error_details)
+        raise Exception(error_details) from e

Optional improvements:

  • Line 68: Use {e!s} instead of {str(e)} for cleaner f-string formatting.
  • Line 70: Consider creating a custom LokiQueryError exception class instead of generic Exception for better error handling downstream.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
error_details = (
f"Failed to query Loki logs:\n"
f" URL: {url}\n"
f" Query: {query}\n"
f" Time range: {start} to {end}\n"
f" Limit: {limit}\n"
f" Error: {str(e)}"
)
raise Exception(error_details)
error_details = (
f"Failed to query Loki logs:\n"
f" URL: {url}\n"
f" Query: {query}\n"
f" Time range: {start} to {end}\n"
f" Limit: {limit}\n"
f" Error: {e!s}"
)
raise Exception(error_details) from e
🧰 Tools
🪛 Ruff (0.13.1)

68-68: Use explicit conversion flag

Replace with conversion flag

(RUF010)


70-70: Within an except clause, raise exceptions with raise ... from err or raise ... from None to distinguish them from errors in exception handling

(B904)


70-70: Create your own exception

(TRY002)

🤖 Prompt for AI Agents
In holmes/plugins/toolsets/grafana/loki_api.py around lines 62 to 70, the code
builds a detailed error_details string and raises a plain Exception without
chaining; update the raise to preserve the original exception by using "raise
Exception(error_details) from e". While editing, replace "{str(e)}" with "{e!s}"
in the f-string for cleaner formatting, and optionally consider defining and
raising a custom LokiQueryError instead of a generic Exception for clearer
downstream handling.



def query_loki_logs_by_label(
Expand All @@ -75,7 +83,18 @@ def query_loki_logs_by_label(
namespace_search_key: str = "namespace",
limit: int = 200,
) -> List[Dict]:
query = f'{{{namespace_search_key}="{namespace}", {label}="{label_value}"}}'
# Handle wildcards: if label_value is "*" or contains wildcards, use regex matching
if label_value == "*":
# Match any value for this label
query = f'{{{namespace_search_key}="{namespace}", {label}=~".+"}}'
elif "*" in label_value:
# Convert wildcard to regex pattern (e.g., "payment-*" -> "payment-.*")
regex_pattern = label_value.replace("*", ".*")
query = f'{{{namespace_search_key}="{namespace}", {label}=~"{regex_pattern}"}}'
else:
# Exact match for the label value
query = f'{{{namespace_search_key}="{namespace}", {label}="{label_value}"}}'

if filter:
query += f' |= "{filter}"'
return execute_loki_query(
Expand Down
75 changes: 75 additions & 0 deletions holmes/plugins/toolsets/grafana/loki_instructions.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
## Loki Logs Tools Usage Guide

Before running logs queries:

** You are often (but not always) running in a kubernetes environment. So users might ask you questions about kubernetes workloads without explicitly stating their type.
** When getting ambiguous questions, use kubectl_find_resource to find the resource you are being asked about!
** Find the involved resource name and kind
** If you can't figure out what is the type of the resource, ask the user for more information and don't guess

### General guideline
- This toolset is used to read pod logs from Loki, a log aggregation system
- Assume the pod should have logs. If logs not found, try to adjust the query
- Loki stores historical logs, so you can query logs for pods that no longer exist

### CRITICAL: Pod Name Resolution Workflow

**IMPORTANT WILDCARD USAGE:**
- **ALWAYS use wildcards** when searching for pods unless you have the COMPLETE pod name with all suffixes
- Kubernetes pod names include deployment hash + replica ID (e.g., `nginx-ingress-7b9899-x2km9`, `frontend-5f4d3b2a1-abc123`)
- When user says "nginx pod" or "frontend pod", search for `nginx-*` or `frontend-*` NOT just `nginx` or `frontend`
- Loki supports wildcards: `*` matches any characters (e.g., `nginx-*`, `*ingress*`, `*-x2km9`)
- For partial matches, use wildcards on both sides: `*keyword*` to find logs from any pod containing "keyword"

**When user provides what looks like a complete pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
- Query Loki directly with that exact pod name
- Do NOT try to verify if the pod exists in Kubernetes first
- This allows querying historical pods that have been deleted/replaced

**When user provides a simple/generic name** (e.g., "nginx", "redis", "payment-service", "auth"):
- **DEFAULT ACTION: Use wildcards** - Query with `pod-name-*` pattern
- For historical queries (yesterday, last week): ALWAYS use wildcards directly in Loki
- For current issues: Optionally use `kubectl_find_resource` to find exact pod names, but wildcards often work better
- Examples:
- User says "nginx pod" → Query Loki with `nginx-*`
- User says "redis instance" → Query Loki with `redis-*`
- User says "payment service" → Query Loki with `payment-*`

**Why wildcards are critical:**
- Pod names in Loki are the actual Kubernetes pod names (with random suffixes)
- Users typically refer to pods by their deployment/service name without suffixes
- Without wildcards, queries for "nginx" will find NOTHING when actual pods are named "nginx-7b9899-x2km9"
- Historical pods that no longer exist can only be found via Loki with proper wildcard usage

### Time Parameters
- Use RFC3339 format: `2023-03-01T10:30:00Z`
- Or relative seconds: `-3600` for 1 hour ago
- Defaults to 7 days window if not specified

### Filter Usage
- The `filter` parameter performs substring matching on log content
- Use it to search for specific error messages, keywords, or patterns
- Example: filter="error" will return only logs containing the word "error"

### Common Investigation Patterns

**For Current Pod Issues:**
1. User asks: "Show logs for payment service"
2. Query Loki with pod_name="payment-*" to get all pods matching that pattern
3. Apply time ranges and filters as needed

**For Historical Pod Issues:**
1. User asks: "What happened to the payment-api pod yesterday at 2pm?"
2. Query Loki directly with pod_name="payment-*" and appropriate time range
3. No need to verify pod existence - Loki has the historical data

**For Debugging Deleted/Restarted Pods:**
1. Pod may no longer exist in cluster
2. Use wildcards to find all historical logs for that workload
3. Loki retains logs even after pods are gone

### Important Notes
- Always inform the user about the actual time period fetched
- If a limit was applied, tell the user how many logs were shown
- If filters were applied, mention them explicitly
- If no logs are found, suggest broader wildcards or time ranges
36 changes: 34 additions & 2 deletions holmes/plugins/toolsets/grafana/toolset_grafana_loki.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import Any, cast, Set
from pydantic import BaseModel

Expand All @@ -23,7 +24,11 @@
from holmes.plugins.toolsets.grafana.loki_api import (
query_loki_logs_by_label,
)
from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
from holmes.core.tools import (
StructuredToolResult,
StructuredToolResultStatus,
ToolParameter,
)


class GrafanaLokiLabelsConfig(BaseModel):
Expand All @@ -35,6 +40,24 @@ class GrafanaLokiConfig(GrafanaConfig):
labels: GrafanaLokiLabelsConfig = GrafanaLokiLabelsConfig()


class LokiPodLoggingTool(PodLoggingTool):
"""Custom pod logging tool for Loki with wildcard support"""

def _get_tool_parameters(self, toolset: BasePodLoggingToolset) -> dict:
"""Override to add wildcard support to pod_name parameter"""
# Get base parameters from parent
params = super()._get_tool_parameters(toolset)

# Override pod_name description to indicate wildcard support
params["pod_name"] = ToolParameter(
description="The kubernetes pod name. Use '*' to fetch logs from all pods in the namespace, or use wildcards like 'payment-*' to match multiple pods",
type="string",
required=True,
)

return params


class GrafanaLokiToolset(BasePodLoggingToolset):
@property
def supported_capabilities(self) -> Set[LoggingCapability]:
Expand All @@ -51,7 +74,9 @@ def __init__(self):
tools=[], # Initialize with empty tools first
)
# Now that parent is initialized and self.name exists, create the tool
self.tools = [PodLoggingTool(self)]
# Use our custom LokiPodLoggingTool with wildcard support
self.tools = [LokiPodLoggingTool(self)]
self._reload_instructions()

def prerequisites_callable(self, config: dict[str, Any]) -> tuple[bool, str]:
if not config:
Expand Down Expand Up @@ -109,3 +134,10 @@ def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
status=StructuredToolResultStatus.NO_DATA,
params=params.model_dump(),
)

def _reload_instructions(self):
"""Load Loki specific instructions."""
template_file_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), "loki_instructions.jinja2")
)
self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
Comment on lines +138 to +143
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Add return type hint for mypy compliance.

The method is missing a return type hint, which is required by the coding guidelines. Add -> None to the method signature.

Apply this diff:

-    def _reload_instructions(self):
+    def _reload_instructions(self) -> None:
         """Load Loki specific instructions."""

Verify the loki_instructions.jinja2 template file exists.

Ensure the template file is present in the same directory as this file, as the path construction expects it to be co-located.

Run this script to verify the template file exists:


🏁 Script executed:

#!/bin/bash
# Verify that loki_instructions.jinja2 exists in the same directory

fd -t f "loki_instructions.jinja2" holmes/plugins/toolsets/grafana/

Length of output: 56


Add missing return type hint for mypy compliance.

Apply this diff:

-    def _reload_instructions(self):
+    def _reload_instructions(self) -> None:
         """Load Loki specific instructions."""
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def _reload_instructions(self):
"""Load Loki specific instructions."""
template_file_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), "loki_instructions.jinja2")
)
self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
def _reload_instructions(self) -> None:
"""Load Loki specific instructions."""
template_file_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), "loki_instructions.jinja2")
)
self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
🤖 Prompt for AI Agents
In holmes/plugins/toolsets/grafana/toolset_grafana_loki.py around lines 138 to
143, the _reload_instructions method is missing a return type hint which fails
mypy; add an explicit return type hint (-> None) to the def signature so it
reads def _reload_instructions(self) -> None:, leaving the body unchanged.

Loading