From 199e083aa8cb5afe648c9ad6d804524179536190 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Tue, 30 Sep 2025 07:21:04 +0300 Subject: [PATCH] WIP --- .../toolsets/datadog/toolset_datadog_logs.py | 99 ++++++++++++++++--- holmes/plugins/toolsets/grafana/loki_api.py | 23 ++++- .../toolsets/grafana/loki_instructions.jinja2 | 75 ++++++++++++++ .../toolsets/grafana/toolset_grafana_loki.py | 36 ++++++- 4 files changed, 214 insertions(+), 19 deletions(-) create mode 100644 holmes/plugins/toolsets/grafana/loki_instructions.jinja2 diff --git a/holmes/plugins/toolsets/datadog/toolset_datadog_logs.py b/holmes/plugins/toolsets/datadog/toolset_datadog_logs.py index 8c5a87b27..61118c865 100644 --- a/holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +++ b/holmes/plugins/toolsets/datadog/toolset_datadog_logs.py @@ -28,6 +28,7 @@ LoggingCapability, PodLoggingTool, ) +from holmes.core.tools import ToolParameter from holmes.plugins.toolsets.utils import process_timestamps_to_rfc3339 @@ -68,6 +69,31 @@ def calculate_page_size( return min(dd_config.page_size, max(0, max_logs_count - logs_count)) +def build_datadog_query( + params: FetchPodLogsParams, + dd_config: DatadogLogsConfig, +) -> str: + """Build Datadog query string from parameters, handling wildcards properly.""" + query_parts = [] + + # Only add namespace filter if not empty or single wildcard + if params.namespace and params.namespace != "*": + query_parts.append(f"{dd_config.labels.namespace}:{params.namespace}") + + # Only add pod filter if not empty or single wildcard + if params.pod_name and params.pod_name != "*": + query_parts.append(f"{dd_config.labels.pod}:{params.pod_name}") + + # Start with base query or use "*" if no specific filters + query = " ".join(query_parts) if query_parts else "*" + + if params.filter: + filter = params.filter.replace('"', '\\"') + query += f' "{filter}"' + + return query + + def fetch_paginated_logs( params: FetchPodLogsParams, dd_config: DatadogLogsConfig, @@ -84,11 +110,8 @@ def fetch_paginated_logs( url = f"{dd_config.site_api_url}/api/v2/logs/events/search" headers = get_headers(dd_config) - query = f"{dd_config.labels.namespace}:{params.namespace}" - query += f" {dd_config.labels.pod}:{params.pod_name}" - if params.filter: - filter = params.filter.replace('"', '\\"') - query += f' "{filter}"' + # Build query using helper function + query = build_datadog_query(params, dd_config) payload: Dict[str, Any] = { "filter": { @@ -169,12 +192,8 @@ def generate_datadog_logs_url( # Convert API URL to app URL using the shared helper base_url = convert_api_url_to_app_url(dd_config.site_api_url) - # Build the query string - query = f"{dd_config.labels.namespace}:{params.namespace}" - query += f" {dd_config.labels.pod}:{params.pod_name}" - if params.filter: - filter = params.filter.replace('"', '\\"') - query += f' "{filter}"' + # Build query using helper function + query = build_datadog_query(params, dd_config) # Process timestamps - get Unix timestamps in seconds (from_time_seconds, to_time_seconds) = process_timestamps_to_int( @@ -204,6 +223,24 @@ def generate_datadog_logs_url( return f"{base_url}/logs?{urlencode(url_params)}" +class DatadogPodLoggingTool(PodLoggingTool): + """Custom pod logging tool for Datadog with wildcard support""" + + def _get_tool_parameters(self, toolset: BasePodLoggingToolset) -> dict: + """Override to add wildcard support to pod_name parameter""" + # Get base parameters from parent + params = super()._get_tool_parameters(toolset) + + # Override pod_name description to indicate 
wildcard support + params["pod_name"] = ToolParameter( + description="The kubernetes pod name. Use '*' to fetch logs from all pods in the namespace, or use wildcards like 'payment-*' to match multiple pods", + type="string", + required=True, + ) + + return params + + class DatadogLogsToolset(BasePodLoggingToolset): dd_config: Optional[DatadogLogsConfig] = None @@ -225,7 +262,8 @@ def __init__(self): tags=[ToolsetTag.CORE], ) # Now that parent is initialized and self.name exists, create the tool - self.tools = [PodLoggingTool(self)] + # Use our custom DatadogPodLoggingTool with wildcard support + self.tools = [DatadogPodLoggingTool(self)] self._reload_instructions() def logger_name(self) -> str: @@ -397,6 +435,8 @@ def _perform_healthcheck(self) -> Tuple[bool, str]: """ try: logging.debug("Performing Datadog configuration healthcheck...") + # Use wildcards which are now properly handled by the query builder + # The query builder will detect "*" and create a broad search query healthcheck_params = FetchPodLogsParams( namespace="*", pod_name="*", @@ -404,16 +444,45 @@ def _perform_healthcheck(self) -> Tuple[bool, str]: start_time="-172800", # 48 hours in seconds ) + # Calculate actual timestamps for debugging + from holmes.plugins.toolsets.utils import process_timestamps_to_int + from datetime import datetime + + (from_time, to_time) = process_timestamps_to_int( + start=healthcheck_params.start_time, + end=healthcheck_params.end_time, + default_time_span_seconds=86400, + ) + from_dt = datetime.fromtimestamp(from_time) + to_dt = datetime.fromtimestamp(to_time) + + logging.info( + f"DEBUG: Running Datadog healthcheck with params: namespace={healthcheck_params.namespace}, pod_name={healthcheck_params.pod_name}, start_time={healthcheck_params.start_time}" + ) + logging.info( + f"DEBUG: Healthcheck time range: from {from_dt.isoformat()} to {to_dt.isoformat()}" + ) + if self.dd_config: + logging.info( + f"DEBUG: Healthcheck query will be sent to: {self.dd_config.site_api_url}" + ) + result = self.fetch_pod_logs(healthcheck_params) + logging.info( + f"DEBUG: Healthcheck result status: {result.status}, error: {result.error}, data length: {len(result.data or '')}" + ) if result.status == StructuredToolResultStatus.ERROR: error_msg = result.error or "Unknown error during healthcheck" logging.error(f"Datadog healthcheck failed: {error_msg}") return False, f"Datadog healthcheck failed: {error_msg}" elif result.status == StructuredToolResultStatus.NO_DATA: - error_msg = "No logs were found in the last 48 hours using wildcards for pod and namespace. Is the configuration correct?" 
-                logging.error(f"Datadog healthcheck failed: {error_msg}")
-                return False, f"Datadog healthcheck failed: {error_msg}"
+                # NO_DATA is acceptable for healthcheck - it just means no logs exist yet
+                # The important thing is that the API call succeeded
+                logging.info(
+                    "Datadog healthcheck completed successfully (no data found, but API is accessible)"
+                )
+                return True, ""

             logging.info("Datadog healthcheck completed successfully")
             return True, ""
diff --git a/holmes/plugins/toolsets/grafana/loki_api.py b/holmes/plugins/toolsets/grafana/loki_api.py
index 6ccb1479f..d1eed94bc 100644
--- a/holmes/plugins/toolsets/grafana/loki_api.py
+++ b/holmes/plugins/toolsets/grafana/loki_api.py
@@ -59,7 +59,15 @@ def execute_loki_query(
         return []

     except requests.exceptions.RequestException as e:
-        raise Exception(f"Failed to query Loki logs: {str(e)}")
+        error_details = (
+            f"Failed to query Loki logs:\n"
+            f"  URL: {url}\n"
+            f"  Query: {query}\n"
+            f"  Time range: {start} to {end}\n"
+            f"  Limit: {limit}\n"
+            f"  Error: {str(e)}"
+        )
+        raise Exception(error_details)


 def query_loki_logs_by_label(
@@ -75,7 +83,18 @@ def query_loki_logs_by_label(
     namespace_search_key: str = "namespace",
     limit: int = 200,
 ) -> List[Dict]:
-    query = f'{{{namespace_search_key}="{namespace}", {label}="{label_value}"}}'
+    # Handle wildcards: if label_value is "*" or contains wildcards, use regex matching
+    if label_value == "*":
+        # Match any value for this label
+        query = f'{{{namespace_search_key}="{namespace}", {label}=~".+"}}'
+    elif "*" in label_value:
+        # Convert wildcard to regex pattern (e.g., "payment-*" -> "payment-.*")
+        regex_pattern = label_value.replace("*", ".*")
+        query = f'{{{namespace_search_key}="{namespace}", {label}=~"{regex_pattern}"}}'
+    else:
+        # Exact match for the label value
+        query = f'{{{namespace_search_key}="{namespace}", {label}="{label_value}"}}'
+
     if filter:
         query += f' |= "{filter}"'
     return execute_loki_query(
diff --git a/holmes/plugins/toolsets/grafana/loki_instructions.jinja2 b/holmes/plugins/toolsets/grafana/loki_instructions.jinja2
new file mode 100644
index 000000000..4d6b8f6c6
--- /dev/null
+++ b/holmes/plugins/toolsets/grafana/loki_instructions.jinja2
@@ -0,0 +1,75 @@
+## Loki Logs Tools Usage Guide
+
+Before running log queries:
+
+- You are often (but not always) running in a Kubernetes environment, so users may ask about Kubernetes workloads without explicitly stating their type.
+- When a question is ambiguous, use kubectl_find_resource to find the resource being asked about
+- Find the involved resource's name and kind
+- If you can't figure out the resource's type, ask the user for more information rather than guessing
+
+### General guidelines
+- This toolset is used to read pod logs from Loki, a log aggregation system
+- Assume the pod should have logs. 
If no logs are found, try adjusting the query
+- Loki stores historical logs, so you can query logs for pods that no longer exist
+
+### CRITICAL: Pod Name Resolution Workflow
+
+**IMPORTANT WILDCARD USAGE:**
+- **ALWAYS use wildcards** when searching for pods unless you have the COMPLETE pod name with all suffixes
+- Kubernetes pod names include a generated ReplicaSet hash and a random suffix (e.g., `nginx-ingress-7b9899-x2km9`, `frontend-5f4d3b2a1-abc123`)
+- When the user says "nginx pod" or "frontend pod", search for `nginx-*` or `frontend-*`, NOT just `nginx` or `frontend`
+- The pod_name parameter supports wildcards: `*` matches any characters (e.g., `nginx-*`, `*ingress*`, `*-x2km9`)
+- For partial matches, use wildcards on both sides: `*keyword*` finds logs from any pod whose name contains "keyword"
+
+**When the user provides what looks like a complete pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- Query Loki directly with that exact pod name
+- Do NOT try to verify whether the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+**When the user provides a simple/generic name** (e.g., "nginx", "redis", "payment-service", "auth"):
+- **DEFAULT ACTION: Use wildcards** - query with a `pod-name-*` pattern
+- For historical queries (yesterday, last week): ALWAYS use wildcards directly in Loki
+- For current issues: optionally use `kubectl_find_resource` to find exact pod names, but wildcards often work better
+- Examples:
+  - User says "nginx pod" → Query Loki with `nginx-*`
+  - User says "redis instance" → Query Loki with `redis-*`
+  - User says "payment service" → Query Loki with `payment-*`
+
+**Why wildcards are critical:**
+- Pod names in Loki are the actual Kubernetes pod names (with random suffixes)
+- Users typically refer to pods by their deployment/service name without suffixes
+- Without wildcards, a query for "nginx" will find NOTHING when the actual pods are named "nginx-7b9899-x2km9"
+- Historical pods that no longer exist can only be found via Loki with proper wildcard usage
+
+### Time Parameters
+- Use RFC3339 format: `2023-03-01T10:30:00Z`
+- Or relative seconds: `-3600` for 1 hour ago
+- Defaults to a 7-day window if not specified
+
+### Filter Usage
+- The `filter` parameter performs substring matching on log content
+- Use it to search for specific error messages, keywords, or patterns
+- Example: filter="error" will return only logs containing the word "error"
+
+### Common Investigation Patterns
+
+**For Current Pod Issues:**
+1. User asks: "Show logs for payment service"
+2. Query Loki with pod_name="payment-*" to get all pods matching that pattern
+3. Apply time ranges and filters as needed
+
+**For Historical Pod Issues:**
+1. User asks: "What happened to the payment-api pod yesterday at 2pm?"
+2. Query Loki directly with pod_name="payment-*" and an appropriate time range
+3. No need to verify pod existence - Loki has the historical data
+
+**For Debugging Deleted/Restarted Pods:**
+1. The pod may no longer exist in the cluster
+2. Use wildcards to find all historical logs for that workload
+3. 
Loki retains logs even after pods are gone + +### Important Notes +- Always inform the user about the actual time period fetched +- If a limit was applied, tell the user how many logs were shown +- If filters were applied, mention them explicitly +- If no logs are found, suggest broader wildcards or time ranges diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py b/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py index 11bbbd8fd..5484dec22 100644 --- a/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +++ b/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py @@ -1,3 +1,4 @@ +import os from typing import Any, cast, Set from pydantic import BaseModel @@ -23,7 +24,11 @@ from holmes.plugins.toolsets.grafana.loki_api import ( query_loki_logs_by_label, ) -from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus +from holmes.core.tools import ( + StructuredToolResult, + StructuredToolResultStatus, + ToolParameter, +) class GrafanaLokiLabelsConfig(BaseModel): @@ -35,6 +40,24 @@ class GrafanaLokiConfig(GrafanaConfig): labels: GrafanaLokiLabelsConfig = GrafanaLokiLabelsConfig() +class LokiPodLoggingTool(PodLoggingTool): + """Custom pod logging tool for Loki with wildcard support""" + + def _get_tool_parameters(self, toolset: BasePodLoggingToolset) -> dict: + """Override to add wildcard support to pod_name parameter""" + # Get base parameters from parent + params = super()._get_tool_parameters(toolset) + + # Override pod_name description to indicate wildcard support + params["pod_name"] = ToolParameter( + description="The kubernetes pod name. Use '*' to fetch logs from all pods in the namespace, or use wildcards like 'payment-*' to match multiple pods", + type="string", + required=True, + ) + + return params + + class GrafanaLokiToolset(BasePodLoggingToolset): @property def supported_capabilities(self) -> Set[LoggingCapability]: @@ -51,7 +74,9 @@ def __init__(self): tools=[], # Initialize with empty tools first ) # Now that parent is initialized and self.name exists, create the tool - self.tools = [PodLoggingTool(self)] + # Use our custom LokiPodLoggingTool with wildcard support + self.tools = [LokiPodLoggingTool(self)] + self._reload_instructions() def prerequisites_callable(self, config: dict[str, Any]) -> tuple[bool, str]: if not config: @@ -109,3 +134,10 @@ def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult: status=StructuredToolResultStatus.NO_DATA, params=params.model_dump(), ) + + def _reload_instructions(self): + """Load Loki specific instructions.""" + template_file_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "loki_instructions.jinja2") + ) + self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
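
A small self-contained sketch of the wildcard handling introduced above: it mirrors build_datadog_query and the label-selector branch added to query_loki_logs_by_label, so the queries emitted for wildcard inputs can be eyeballed without a live backend. The function names and the kube_namespace / pod_name label values used in the demo are illustrative assumptions, not identifiers taken from this patch.

# Sketch only: stand-alone mirrors of the two query builders in this patch.
def datadog_query(namespace: str, pod_name: str, namespace_label: str,
                  pod_label: str, filter: str = "") -> str:
    # Same logic as build_datadog_query: skip empty or "*" selectors, fall back to "*"
    parts = []
    if namespace and namespace != "*":
        parts.append(f"{namespace_label}:{namespace}")
    if pod_name and pod_name != "*":
        parts.append(f"{pod_label}:{pod_name}")
    query = " ".join(parts) if parts else "*"
    if filter:
        escaped = filter.replace('"', '\\"')
        query += f' "{escaped}"'
    return query


def loki_selector(namespace: str, label: str, label_value: str,
                  namespace_key: str = "namespace") -> str:
    # Same logic as query_loki_logs_by_label: "*" -> =~".+", embedded "*" -> regex, else exact match
    if label_value == "*":
        return f'{{{namespace_key}="{namespace}", {label}=~".+"}}'
    if "*" in label_value:
        regex_pattern = label_value.replace("*", ".*")
        return f'{{{namespace_key}="{namespace}", {label}=~"{regex_pattern}"}}'
    return f'{{{namespace_key}="{namespace}", {label}="{label_value}"}}'


if __name__ == "__main__":
    print(datadog_query("default", "payment-*", "kube_namespace", "pod_name"))
    # kube_namespace:default pod_name:payment-*   (Datadog handles "*" natively, so it passes through)
    print(datadog_query("*", "*", "kube_namespace", "pod_name", filter="timeout"))
    # * "timeout"
    print(loki_selector("default", "pod", "payment-*"))
    # {namespace="default", pod=~"payment-.*"}
    print(loki_selector("default", "pod", "*"))
    # {namespace="default", pod=~".+"}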