From 199e083aa8cb5afe648c9ad6d804524179536190 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Tue, 30 Sep 2025 07:21:04 +0300 Subject: [PATCH] WIP --- .../toolsets/datadog/toolset_datadog_logs.py | 99 ++++++++++++++++--- holmes/plugins/toolsets/grafana/loki_api.py | 23 ++++- .../toolsets/grafana/loki_instructions.jinja2 | 75 ++++++++++++++ .../toolsets/grafana/toolset_grafana_loki.py | 36 ++++++- 4 files changed, 214 insertions(+), 19 deletions(-) create mode 100644 holmes/plugins/toolsets/grafana/loki_instructions.jinja2 diff --git a/holmes/plugins/toolsets/datadog/toolset_datadog_logs.py b/holmes/plugins/toolsets/datadog/toolset_datadog_logs.py index 8c5a87b27..61118c865 100644 --- a/holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +++ b/holmes/plugins/toolsets/datadog/toolset_datadog_logs.py @@ -28,6 +28,7 @@ LoggingCapability, PodLoggingTool, ) +from holmes.core.tools import ToolParameter from holmes.plugins.toolsets.utils import process_timestamps_to_rfc3339 @@ -68,6 +69,31 @@ def calculate_page_size( return min(dd_config.page_size, max(0, max_logs_count - logs_count)) +def build_datadog_query( + params: FetchPodLogsParams, + dd_config: DatadogLogsConfig, +) -> str: + """Build Datadog query string from parameters, handling wildcards properly.""" + query_parts = [] + + # Only add namespace filter if not empty or single wildcard + if params.namespace and params.namespace != "*": + query_parts.append(f"{dd_config.labels.namespace}:{params.namespace}") + + # Only add pod filter if not empty or single wildcard + if params.pod_name and params.pod_name != "*": + query_parts.append(f"{dd_config.labels.pod}:{params.pod_name}") + + # Start with base query or use "*" if no specific filters + query = " ".join(query_parts) if query_parts else "*" + + if params.filter: + filter = params.filter.replace('"', '\\"') + query += f' "{filter}"' + + return query + + def fetch_paginated_logs( params: FetchPodLogsParams, dd_config: DatadogLogsConfig, @@ -84,11 +110,8 @@ def fetch_paginated_logs( url = f"{dd_config.site_api_url}/api/v2/logs/events/search" headers = get_headers(dd_config) - query = f"{dd_config.labels.namespace}:{params.namespace}" - query += f" {dd_config.labels.pod}:{params.pod_name}" - if params.filter: - filter = params.filter.replace('"', '\\"') - query += f' "{filter}"' + # Build query using helper function + query = build_datadog_query(params, dd_config) payload: Dict[str, Any] = { "filter": { @@ -169,12 +192,8 @@ def generate_datadog_logs_url( # Convert API URL to app URL using the shared helper base_url = convert_api_url_to_app_url(dd_config.site_api_url) - # Build the query string - query = f"{dd_config.labels.namespace}:{params.namespace}" - query += f" {dd_config.labels.pod}:{params.pod_name}" - if params.filter: - filter = params.filter.replace('"', '\\"') - query += f' "{filter}"' + # Build query using helper function + query = build_datadog_query(params, dd_config) # Process timestamps - get Unix timestamps in seconds (from_time_seconds, to_time_seconds) = process_timestamps_to_int( @@ -204,6 +223,24 @@ def generate_datadog_logs_url( return f"{base_url}/logs?{urlencode(url_params)}" +class DatadogPodLoggingTool(PodLoggingTool): + """Custom pod logging tool for Datadog with wildcard support""" + + def _get_tool_parameters(self, toolset: BasePodLoggingToolset) -> dict: + """Override to add wildcard support to pod_name parameter""" + # Get base parameters from parent + params = super()._get_tool_parameters(toolset) + + # Override pod_name description to indicate 
wildcard support + params["pod_name"] = ToolParameter( + description="The kubernetes pod name. Use '*' to fetch logs from all pods in the namespace, or use wildcards like 'payment-*' to match multiple pods", + type="string", + required=True, + ) + + return params + + class DatadogLogsToolset(BasePodLoggingToolset): dd_config: Optional[DatadogLogsConfig] = None @@ -225,7 +262,8 @@ def __init__(self): tags=[ToolsetTag.CORE], ) # Now that parent is initialized and self.name exists, create the tool - self.tools = [PodLoggingTool(self)] + # Use our custom DatadogPodLoggingTool with wildcard support + self.tools = [DatadogPodLoggingTool(self)] self._reload_instructions() def logger_name(self) -> str: @@ -397,6 +435,8 @@ def _perform_healthcheck(self) -> Tuple[bool, str]: """ try: logging.debug("Performing Datadog configuration healthcheck...") + # Use wildcards which are now properly handled by the query builder + # The query builder will detect "*" and create a broad search query healthcheck_params = FetchPodLogsParams( namespace="*", pod_name="*", @@ -404,16 +444,45 @@ def _perform_healthcheck(self) -> Tuple[bool, str]: start_time="-172800", # 48 hours in seconds ) + # Calculate actual timestamps for debugging + from holmes.plugins.toolsets.utils import process_timestamps_to_int + from datetime import datetime + + (from_time, to_time) = process_timestamps_to_int( + start=healthcheck_params.start_time, + end=healthcheck_params.end_time, + default_time_span_seconds=86400, + ) + from_dt = datetime.fromtimestamp(from_time) + to_dt = datetime.fromtimestamp(to_time) + + logging.info( + f"DEBUG: Running Datadog healthcheck with params: namespace={healthcheck_params.namespace}, pod_name={healthcheck_params.pod_name}, start_time={healthcheck_params.start_time}" + ) + logging.info( + f"DEBUG: Healthcheck time range: from {from_dt.isoformat()} to {to_dt.isoformat()}" + ) + if self.dd_config: + logging.info( + f"DEBUG: Healthcheck query will be sent to: {self.dd_config.site_api_url}" + ) + result = self.fetch_pod_logs(healthcheck_params) + logging.info( + f"DEBUG: Healthcheck result status: {result.status}, error: {result.error}, data length: {len(result.data or '')}" + ) if result.status == StructuredToolResultStatus.ERROR: error_msg = result.error or "Unknown error during healthcheck" logging.error(f"Datadog healthcheck failed: {error_msg}") return False, f"Datadog healthcheck failed: {error_msg}" elif result.status == StructuredToolResultStatus.NO_DATA: - error_msg = "No logs were found in the last 48 hours using wildcards for pod and namespace. Is the configuration correct?" 
-                logging.error(f"Datadog healthcheck failed: {error_msg}")
-                return False, f"Datadog healthcheck failed: {error_msg}"
+                # NO_DATA is acceptable for healthcheck - it just means no logs exist yet
+                # The important thing is that the API call succeeded
+                logging.info(
+                    "Datadog healthcheck completed successfully (no data found, but API is accessible)"
+                )
+                return True, ""

             logging.info("Datadog healthcheck completed successfully")
             return True, ""
diff --git a/holmes/plugins/toolsets/grafana/loki_api.py b/holmes/plugins/toolsets/grafana/loki_api.py
index 6ccb1479f..d1eed94bc 100644
--- a/holmes/plugins/toolsets/grafana/loki_api.py
+++ b/holmes/plugins/toolsets/grafana/loki_api.py
@@ -59,7 +59,15 @@ def execute_loki_query(
         return []

     except requests.exceptions.RequestException as e:
-        raise Exception(f"Failed to query Loki logs: {str(e)}")
+        error_details = (
+            f"Failed to query Loki logs:\n"
+            f"  URL: {url}\n"
+            f"  Query: {query}\n"
+            f"  Time range: {start} to {end}\n"
+            f"  Limit: {limit}\n"
+            f"  Error: {str(e)}"
+        )
+        raise Exception(error_details)


 def query_loki_logs_by_label(
@@ -75,7 +83,18 @@ def query_loki_logs_by_label(
     namespace_search_key: str = "namespace",
     limit: int = 200,
 ) -> List[Dict]:
-    query = f'{{{namespace_search_key}="{namespace}", {label}="{label_value}"}}'
+    # Handle wildcards: if label_value is "*" or contains wildcards, use regex matching
+    if label_value == "*":
+        # Match any value for this label
+        query = f'{{{namespace_search_key}="{namespace}", {label}=~".+"}}'
+    elif "*" in label_value:
+        # Convert wildcard to regex pattern (e.g., "payment-*" -> "payment-.*")
+        regex_pattern = label_value.replace("*", ".*")
+        query = f'{{{namespace_search_key}="{namespace}", {label}=~"{regex_pattern}"}}'
+    else:
+        # Exact match for the label value
+        query = f'{{{namespace_search_key}="{namespace}", {label}="{label_value}"}}'
+
     if filter:
         query += f' |= "{filter}"'
     return execute_loki_query(
diff --git a/holmes/plugins/toolsets/grafana/loki_instructions.jinja2 b/holmes/plugins/toolsets/grafana/loki_instructions.jinja2
new file mode 100644
index 000000000..4d6b8f6c6
--- /dev/null
+++ b/holmes/plugins/toolsets/grafana/loki_instructions.jinja2
@@ -0,0 +1,75 @@
+## Loki Logs Tools Usage Guide
+
+Before running log queries:
+
+- You are often (but not always) running in a Kubernetes environment, so users may ask about Kubernetes workloads without explicitly stating their type.
+- When a question is ambiguous, use kubectl_find_resource to find the resource being asked about
+- Find the involved resource's name and kind
+- If you can't figure out the resource's type, ask the user for more information rather than guessing
+
+### General guidelines
+- This toolset is used to read pod logs from Loki, a log aggregation system
+- Assume the pod should have logs. 
If no logs are found, try adjusting the query
+- Loki stores historical logs, so you can query logs for pods that no longer exist
+
+### CRITICAL: Pod Name Resolution Workflow
+
+**IMPORTANT WILDCARD USAGE:**
+- **ALWAYS use wildcards** when searching for pods unless you have the COMPLETE pod name with all suffixes
+- Kubernetes pod names include a generated ReplicaSet hash and a random suffix (e.g., `nginx-ingress-7b9899-x2km9`, `frontend-5f4d3b2a1-abc123`)
+- When the user says "nginx pod" or "frontend pod", search for `nginx-*` or `frontend-*`, NOT just `nginx` or `frontend`
+- The pod_name parameter supports wildcards: `*` matches any characters (e.g., `nginx-*`, `*ingress*`, `*-x2km9`)
+- For partial matches, use wildcards on both sides: `*keyword*` finds logs from any pod whose name contains "keyword"
+
+**When the user provides what looks like a complete pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- Query Loki directly with that exact pod name
+- Do NOT try to verify whether the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+**When the user provides a simple/generic name** (e.g., "nginx", "redis", "payment-service", "auth"):
+- **DEFAULT ACTION: Use wildcards** - query with a `pod-name-*` pattern
+- For historical queries (yesterday, last week): ALWAYS use wildcards directly in Loki
+- For current issues: optionally use `kubectl_find_resource` to find exact pod names, but wildcards often work better
+- Examples:
+  - User says "nginx pod" → Query Loki with `nginx-*`
+  - User says "redis instance" → Query Loki with `redis-*`
+  - User says "payment service" → Query Loki with `payment-*`
+
+**Why wildcards are critical:**
+- Pod names in Loki are the actual Kubernetes pod names (with random suffixes)
+- Users typically refer to pods by their deployment/service name without suffixes
+- Without wildcards, a query for "nginx" will find NOTHING when the actual pods are named "nginx-7b9899-x2km9"
+- Historical pods that no longer exist can only be found via Loki with proper wildcard usage
+
+### Time Parameters
+- Use RFC3339 format: `2023-03-01T10:30:00Z`
+- Or relative seconds: `-3600` for 1 hour ago
+- Defaults to a 7-day window if not specified
+
+### Filter Usage
+- The `filter` parameter performs substring matching on log content
+- Use it to search for specific error messages, keywords, or patterns
+- Example: filter="error" will return only logs containing the word "error"
+
+### Common Investigation Patterns
+
+**For Current Pod Issues:**
+1. User asks: "Show logs for payment service"
+2. Query Loki with pod_name="payment-*" to get all pods matching that pattern
+3. Apply time ranges and filters as needed
+
+**For Historical Pod Issues:**
+1. User asks: "What happened to the payment-api pod yesterday at 2pm?"
+2. Query Loki directly with pod_name="payment-*" and an appropriate time range
+3. No need to verify pod existence - Loki has the historical data
+
+**For Debugging Deleted/Restarted Pods:**
+1. The pod may no longer exist in the cluster
+2. Use wildcards to find all historical logs for that workload
+3. 
Loki retains logs even after pods are gone + +### Important Notes +- Always inform the user about the actual time period fetched +- If a limit was applied, tell the user how many logs were shown +- If filters were applied, mention them explicitly +- If no logs are found, suggest broader wildcards or time ranges diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py b/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py index 11bbbd8fd..5484dec22 100644 --- a/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +++ b/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py @@ -1,3 +1,4 @@ +import os from typing import Any, cast, Set from pydantic import BaseModel @@ -23,7 +24,11 @@ from holmes.plugins.toolsets.grafana.loki_api import ( query_loki_logs_by_label, ) -from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus +from holmes.core.tools import ( + StructuredToolResult, + StructuredToolResultStatus, + ToolParameter, +) class GrafanaLokiLabelsConfig(BaseModel): @@ -35,6 +40,24 @@ class GrafanaLokiConfig(GrafanaConfig): labels: GrafanaLokiLabelsConfig = GrafanaLokiLabelsConfig() +class LokiPodLoggingTool(PodLoggingTool): + """Custom pod logging tool for Loki with wildcard support""" + + def _get_tool_parameters(self, toolset: BasePodLoggingToolset) -> dict: + """Override to add wildcard support to pod_name parameter""" + # Get base parameters from parent + params = super()._get_tool_parameters(toolset) + + # Override pod_name description to indicate wildcard support + params["pod_name"] = ToolParameter( + description="The kubernetes pod name. Use '*' to fetch logs from all pods in the namespace, or use wildcards like 'payment-*' to match multiple pods", + type="string", + required=True, + ) + + return params + + class GrafanaLokiToolset(BasePodLoggingToolset): @property def supported_capabilities(self) -> Set[LoggingCapability]: @@ -51,7 +74,9 @@ def __init__(self): tools=[], # Initialize with empty tools first ) # Now that parent is initialized and self.name exists, create the tool - self.tools = [PodLoggingTool(self)] + # Use our custom LokiPodLoggingTool with wildcard support + self.tools = [LokiPodLoggingTool(self)] + self._reload_instructions() def prerequisites_callable(self, config: dict[str, Any]) -> tuple[bool, str]: if not config: @@ -109,3 +134,10 @@ def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult: status=StructuredToolResultStatus.NO_DATA, params=params.model_dump(), ) + + def _reload_instructions(self): + """Load Loki specific instructions.""" + template_file_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "loki_instructions.jinja2") + ) + self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
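
A small self-contained sketch of the wildcard handling introduced above: it mirrors build_datadog_query and the label-selector branch added to query_loki_logs_by_label, so the queries emitted for wildcard inputs can be eyeballed without a live backend. The function names and the kube_namespace / pod_name label values used in the demo are illustrative assumptions, not identifiers taken from this patch.

# Sketch only: stand-alone mirrors of the two query builders in this patch.
def datadog_query(namespace: str, pod_name: str, namespace_label: str,
                  pod_label: str, filter: str = "") -> str:
    # Same logic as build_datadog_query: skip empty or "*" selectors, fall back to "*"
    parts = []
    if namespace and namespace != "*":
        parts.append(f"{namespace_label}:{namespace}")
    if pod_name and pod_name != "*":
        parts.append(f"{pod_label}:{pod_name}")
    query = " ".join(parts) if parts else "*"
    if filter:
        escaped = filter.replace('"', '\\"')
        query += f' "{escaped}"'
    return query


def loki_selector(namespace: str, label: str, label_value: str,
                  namespace_key: str = "namespace") -> str:
    # Same logic as query_loki_logs_by_label: "*" -> =~".+", embedded "*" -> regex, else exact match
    if label_value == "*":
        return f'{{{namespace_key}="{namespace}", {label}=~".+"}}'
    if "*" in label_value:
        regex_pattern = label_value.replace("*", ".*")
        return f'{{{namespace_key}="{namespace}", {label}=~"{regex_pattern}"}}'
    return f'{{{namespace_key}="{namespace}", {label}="{label_value}"}}'


if __name__ == "__main__":
    print(datadog_query("default", "payment-*", "kube_namespace", "pod_name"))
    # kube_namespace:default pod_name:payment-*   (Datadog handles "*" natively, so it passes through)
    print(datadog_query("*", "*", "kube_namespace", "pod_name", filter="timeout"))
    # * "timeout"
    print(loki_selector("default", "pod", "payment-*"))
    # {namespace="default", pod=~"payment-.*"}
    print(loki_selector("default", "pod", "*"))
    # {namespace="default", pod=~".+"}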