Use a multi-agent approach

sherine-k · sherine-k · commit 6b258541057f · 2025-07-03T14:33:07.000+02:00
diff --git a/__init__.py b/__init__.py
@@ -1 +1,3 @@
+"""CI Analysis coordinator: provide root cause analysis for CI failures"""
+
 from . import agent
diff --git a/agent.py b/agent.py
@@ -1,41 +1,40 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""CI Analysis coordinator: provide root cause analysis for CI failures"""
+
 from google.adk.agents import LlmAgent
-from google.adk.models.lite_llm import LiteLlm
-from google.adk.tools.mcp_tool.mcp_toolset import MCPToolset, StdioServerParameters
+from google.adk.tools.agent_tool import AgentTool
+
+from . import prompt
+from .sub_agents.installation_analyst import installation_analyst_agent
+from .sub_agents.mustgather_analyst import mustgather_analyst_agent
+MODEL = "gemini-2.0-flash"
 
 
-# TARGET_FOLDER_PATH = os.path.dirname(os.path.abspath(__file__))
- 
-root_agent = LlmAgent(
-    name="root_agent_v1",
-    # model="gemini-2.5-pro-preview-05-06",
-    model="gemini-2.0-flash",
-    # model=LiteLlm(model="ollama/qwen3:4b"),
-    description="Provides analysis of CI jobs, and determines if the cluster installation was successful or not.",
-    global_instruction="You are a helpful Kubernetes and Prow expert assistant. "
-        "You are specialized in Openshift installation."
-        "Your main goal is to analyze the Prow job's installation logs and diagnose possible failures of the cluster installation for the Prow job."
-        "You provide root cause analysis for installation failures and propose solutions if possible."
-        "You are truthful, concise, and helpful."
-        "You never speculate about clusters being installed or fabricate information."
-        "If you do not know the answer, you acknowledge the fact and end your response."
-        "Your responses must be as short as possible."
-        "CI JOB ANALYSIS WORKFLOW:"
-        "-------------------------"
-        "When analyzing a job failure, follow this recommended workflow:"
-        "1. First, get a job's metadata (including test_name) and status by using 'get_job_metadata' tool."
-        "2. Then, once you have the metadata, you can get install logs by using the 'get_install_logs' tool."
-        "3. Check that the installation was successful by looking at the install logs."
-        "4. Only if installation is successful, use 'get_build_logs' to get the job logs." 
-        "5. Analyze the job build logs to determine the root cause of the failure.",
-        
-    tools=[ 
-           MCPToolset(
-            connection_params=StdioServerParameters(
-                command='podman',
-                args=["run", "-i", "-p", "9000:8000", "--rm",  "-e", "MCP_TRANSPORT=stdio", "localhost/mcp-server-template:latest"],
-                tool_filter=['get_build_logs','get_install_logs', 'get_job_metadata'],
-            )
-           ), 
-        ], 
+# Coordinator agent. The two analyst sub-agents are wrapped in AgentTool so
+# the coordinator's model can invoke them as tools.
+ci_analysis_advisor = LlmAgent(
+    name="ci_analysis_advisor",
+    model=MODEL,
+    description=(
+        "Analyzes CI jobs and provides root cause analysis for failures."
+    ),
+    instruction=prompt.CI_ANALYSIS_COORDINATOR_PROMPT,
+    # NOTE(review): per ADK docs, the agent's final response is stored in
+    # session state under this key -- confirm against the ADK version in use.
+    output_key="ci_analysis_advisor_output",
+    tools=[
+        AgentTool(agent=installation_analyst_agent),
+        AgentTool(agent=mustgather_analyst_agent),
+    ],
 )
 
+root_agent = ci_analysis_advisor
diff --git a/must_gather.py b/must_gather.py
diff --git a/prompt.py b/prompt.py
@@ -0,0 +1,42 @@
+
+
+"""Prompt for the ci_analysis_advisor_agent."""
+
+CI_ANALYSIS_COORDINATOR_PROMPT = """
+Role: Act as a specialized Prow CI advisory assistant.
+
+Overall Instructions for Interaction:
+
+You are a helpful Kubernetes and Prow expert assistant. 
+Your main goal is to analyze the Prow job and diagnose possible failures in the installation and tests  performed by the Prow job.
+You provide root cause analysis for the failures and propose solutions if possible.
+You are truthful, concise, and helpful.
+You never speculate about clusters being installed or fabricate information.
+If you do not know the answer, you acknowledge the fact and end your response.
+Your responses must be as short as possible.
+CI JOB ANALYSIS WORKFLOW:
+-------------------------
+When analyzing a job failure, follow this recommended workflow:
+1. First, check that the installation was successful.
+2. Only if installation is successful, check the must-gather logs for more insights.
+
+
+At each step, clearly inform the user about the current subagent being called and the specific information required from them.
+After each subagent completes its task, explain the output provided and how it contributes to the overall root cause analysis  process.
+Ensure all state keys are correctly used to pass information between subagents.
+Here's the step-by-step breakdown.
+For each step, explicitly call the designated subagent and adhere strictly to the specified input and output formats:
+
+* Installation Analysis (Subagent: installation_analyst_agent)
+
+Input: Prompt the user to provide the link to the prow job they wish to analyze. 
+Action: Parse the URL for the job_name and build_id. Call the installation_analyst_agent subagent, passing the user-provided job_name and build_id.
+Expected Output: The installation_analyst_agent subagent MUST return the job's job_name, build_id, test_name and a comprehensive data analysis for the installation of the cluster for the given job.
+
+* Must_Gather Analysis (Subagent: mustgather_analyst_agent)
+
+Input: The installation_analysis_output from the installation_analyst_agent subagent. Use /tmp/must-gather as the target_folder for the must-gather directory.
+Action: Call the mustgather_analyst_agent subagent, passing the job_name, test_name and build_id. Download the must-gather logs: use /tmp/must-gather as the target_folder. Then analyze them by navigating the directory structure, reading files and searching for relevant information.
+Expected Output: The mustgather_analyst_agent subagent MUST return a comprehensive data analysis for the execution of the given job.
+
+"""
diff --git a/prow_mcp_server/mcp_server.py b/prow_mcp_server/mcp_server.py
@@ -174,7 +174,7 @@ async def get_install_logs(job_name: str, build_id: str, test_name: str):
         test_name: The name of the test for which to get install logs
         
     Returns:
-        Dictionary containing the installation logs or error information
+        Dictionary containing the job metadata (job_name, build_id, test_name), installation logs or error information
     """
     try:
         # Construct the artifacts URL
diff --git a/sub_agents/installation_analyst/__init__.py b/sub_agents/installation_analyst/__init__.py
@@ -0,0 +1 @@
+from .agent import installation_analyst_agent
diff --git a/sub_agents/installation_analyst/agent.py b/sub_agents/installation_analyst/agent.py
@@ -0,0 +1,21 @@
+from google.adk import Agent
+from google.adk.tools.mcp_tool.mcp_toolset import MCPToolset, StdioServerParameters
+from . import prompt
+
+MODEL = "gemini-2.0-flash"
+
+
+# Sub-agent that analyzes the installation phase of a Prow job. Its tools are
+# served by an MCP server that is launched through podman and spoken to over
+# stdio (MCP_TRANSPORT=stdio).
+installation_analyst_agent = Agent(
+    model=MODEL,
+    name="installation_analyst_agent",
+    instruction=prompt.INSTALLATION_SPECIALIST_PROMPT,
+    output_key="installation_analysis_output",
+    tools=[
+        MCPToolset(
+            connection_params=StdioServerParameters(
+                command='podman',
+                args=["run", "-i", "-p", "9000:8000", "--rm",  "-e", "MCP_TRANSPORT=stdio", "localhost/mcp-server-template:latest"],
+            ),
+            # tool_filter is an option of MCPToolset, not of the stdio
+            # connection parameters; placed on StdioServerParameters it has no
+            # filtering effect and every server tool would be exposed.
+            tool_filter=['get_install_logs', 'get_job_metadata'],
+        ),
+    ],
+)
diff --git a/sub_agents/installation_analyst/prompt.py b/sub_agents/installation_analyst/prompt.py
@@ -0,0 +1,17 @@
+INSTALLATION_SPECIALIST_PROMPT = """
+You are a helpful Kubernetes and Prow expert assistant. 
+You are specialized in Openshift installation.
+Your main goal is to analyze the Prow job and diagnose possible failures in the installation and tests  performed by the Prow job.
+You provide root cause analysis for the failures and propose solutions if possible.
+You are truthful, concise, and helpful.
+You never speculate about clusters being installed or fabricate information.
+If you do not know the answer, you acknowledge the fact and end your response.
+Your responses must be as short as possible.
+
+First, get a job's metadata (including test_name) and status by using the 'get_job_metadata' tool; you can get the build_id and the job_name from the URL provided by the user (respectively, the last and the second-to-last parts of the URL).
+Then, once you have the metadata, you can get install logs by using the 'get_install_logs' tool.
+Look for possible failures in the install logs.
+If you find any failures, provide a root cause analysis for the failures and propose solutions if possible.
+If you do not find any failures, say so.
+All your answers should contain the job_name, build_id and test_name.
+"""
diff --git a/sub_agents/mustgather_analyst/__init__.py b/sub_agents/mustgather_analyst/__init__.py
@@ -0,0 +1 @@
+from .agent import mustgather_analyst_agent
diff --git a/sub_agents/mustgather_analyst/agent.py b/sub_agents/mustgather_analyst/agent.py
@@ -0,0 +1,12 @@
+from google.adk import Agent
+from . import prompt
+from .must_gather import get_must_gather, list_directory, read_drained_file, get_file_info, search_files
+MODEL = "gemini-2.0-flash"
+
+# Must-gather analyst sub-agent. Its tools are the plain Python functions
+# imported from the sibling must_gather module.
+mustgather_analyst_agent = Agent(
+    name="mustgather_analyst_agent",
+    model=MODEL,
+    instruction=prompt.MUST_GATHER_SPECIALIST_PROMPT,
+    output_key="must_gather_analysis_output",
+    tools=[
+        get_must_gather,
+        list_directory,
+        read_drained_file,
+        get_file_info,
+        search_files,
+    ],
+)
diff --git a/sub_agents/mustgather_analyst/drain.py b/sub_agents/mustgather_analyst/drain.py
@@ -0,0 +1,100 @@
+import os
+import logging
+from typing import Tuple, Generator, Dict, List, Any, Optional
+
+import drain3
+from drain3.template_miner_config import TemplateMinerConfig
+from mcp.server.fastmcp import FastMCP
+
+
+
+# Set up logging
+LOG = logging.getLogger("drain")
+
+
+
+
+def chunk_continues(text: str, index: int) -> bool:
+    """Tell whether the chunk ending at ``text[index]`` continues on the next line.
+
+    ``get_chunks`` calls this with ``index`` pointing at a newline character.
+    The chunk is considered to continue when any of the following holds:
+
+    * the next character is whitespace
+    * the previous character is a backslash '\\'
+    * the previous character is a colon ':'
+
+    A neighbour that falls outside ``text`` is treated as absent, so the
+    first and last positions of the text are safe to query.
+
+    Args:
+        text: The full log text.
+        index: Position in ``text`` to examine.
+
+    Returns:
+        True if any continuation rule matches, False otherwise.
+    """
+    # Guard the look-ahead: at the last position there is no next character.
+    next_is_space = index + 1 < len(text) and text[index + 1].isspace()
+    # Guard the look-behind: text[-1] would wrap around to the END of the
+    # text and could report a bogus continuation for index 0.
+    previous = text[index - 1] if index > 0 else ""
+    return next_is_space or previous in ("\\", ":")
+
+
+def get_chunks(text: str) -> Generator[Tuple[int, str], None, None]:
+    """Split a log into chunks of one or more lines.
+
+    A chunk ends at a newline unless ``chunk_continues`` reports that the
+    following line belongs to it (heuristic based on whitespace, backslash
+    and colon presence).
+
+    Args:
+        text: The full log text.
+
+    Yields:
+        ``(start_line, chunk)`` tuples, where ``start_line`` is the 0-based
+        number of the line on which the chunk starts and ``chunk`` is the
+        chunk text including its newline characters. Text after the last
+        newline is yielded as a final chunk instead of being dropped.
+    """
+    text_len = len(text)
+    chunk_start = 0  # character offset where the current chunk begins
+    start_line = 0  # 0-based line number where the current chunk begins
+    lines_seen = 0  # newlines consumed so far == 0-based number of next line
+    for i, char in enumerate(text):
+        if char != "\n":
+            continue
+        lines_seen += 1
+        if i + 1 < text_len and chunk_continues(text, i):
+            # The next line is part of this chunk; keep accumulating.
+            continue
+        # Slice once per chunk rather than appending character by character.
+        yield (start_line, text[chunk_start : i + 1])
+        chunk_start = i + 1
+        start_line = lines_seen
+    # Emit the remainder when the log does not end with a newline.
+    if chunk_start < text_len:
+        yield (start_line, text[chunk_start:])
+
+
+class DrainExtractor:
+    """Select representative log chunks using the Drain3 template miner.
+
+    The miner is configured from the ``drain3.ini`` file located next to this
+    module. ``verbose`` toggles the miner's profiling and ``max_clusters``
+    caps the number of clusters; ``context`` is only stored on the instance.
+    """
+
+    def __init__(self, verbose: bool = False, context: bool = False, max_clusters=8):
+        miner_config = TemplateMinerConfig()
+        miner_config.load(f"{os.path.dirname(__file__)}/drain3.ini")
+        miner_config.profiling_enabled = verbose
+        miner_config.drain_max_clusters = max_clusters
+        self.verbose = verbose
+        self.context = context
+        self.miner = drain3.TemplateMiner(config=miner_config)
+
+    def __call__(self, log: str) -> list[Tuple[int, str]]:
+        """Return ``(start_line, chunk)`` for the first chunk of each cluster.
+
+        The log is chunked twice: once to train the miner, and once more to
+        map the resulting clusters back to original text. Results are in log
+        order, with at most one chunk per cluster.
+        """
+        # Pass 1: feed every chunk to the miner so it builds its clusters.
+        for _start, text in get_chunks(log):
+            LOG.debug(self.miner.add_log_message(text))
+
+        # Clusters still waiting for a representative chunk, largest first.
+        pending = sorted(
+            self.miner.drain.clusters, key=lambda c: c.size, reverse=True
+        )
+
+        # Pass 2: keep the first chunk that matches each pending cluster.
+        selected = []
+        for start, text in get_chunks(log):
+            matched = self.miner.match(text, "always")
+            if matched not in pending:
+                continue
+            selected.append((start, text))
+            pending.remove(matched)
+        return selected
+
+
+
+
+
+
diff --git a/sub_agents/mustgather_analyst/drain3.ini b/sub_agents/mustgather_analyst/drain3.ini
diff --git a/sub_agents/mustgather_analyst/must_gather.py b/sub_agents/mustgather_analyst/must_gather.py
diff --git a/sub_agents/mustgather_analyst/prompt.py b/sub_agents/mustgather_analyst/prompt.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1,3 @@`
	`1`	`+"""CI Analysis coordinator: provide root cause analysis for CI failures"""`
	`2`	`+`
`1`	`3`	`from . import agent`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .agent import installation_analyst_agent`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .agent import mustgather_analyst_agent`