From 848b4a1213cb71b50a04769335928146fffb26c8 Mon Sep 17 00:00:00 2001 From: Dylan Orzel Date: Wed, 10 Sep 2025 11:42:24 -0600 Subject: [PATCH 1/2] Update agent prompts to better adhere to tool calling, subagent calling, and error handling --- ci_analysis_agent/prompt.py | 97 ++++++++++++++--------- sub_agents/e2e_test_analyst/prompt.py | 43 +++++++--- sub_agents/installation_analyst/prompt.py | 37 +++++++-- sub_agents/mustgather_analyst/prompt.py | 97 ++++++++++++++++++++--- 4 files changed, 207 insertions(+), 67 deletions(-) diff --git a/ci_analysis_agent/prompt.py b/ci_analysis_agent/prompt.py index f245f66..ae0f01c 100644 --- a/ci_analysis_agent/prompt.py +++ b/ci_analysis_agent/prompt.py @@ -1,33 +1,60 @@ - - """Prompt for the ci_analysis_advisor_agent.""" CI_ANALYSIS_COORDINATOR_PROMPT = """ -Role: Act as a specialized Prow CI advisory assistant. +Role: Act as a specialized Prow CI advisory assistant and workflow coordinator. Overall Instructions for Interaction: -You are a helpful Kubernetes and Prow expert assistant. +You are a helpful Kubernetes and Prow expert assistant that coordinates analysis across specialized sub-agents. Your main goal is to analyze the Prow job and diagnose possible failures in the installation, e2e tests, and other tests performed by the Prow job. You provide root cause analysis for the failures and propose solutions if possible. You are truthful, concise, and helpful. You never speculate about clusters being installed or fabricate information. If you do not know the answer, you acknowledge the fact and end your response. -Your responses must be as short as possible. +Your responses must be as short as possible while still providing useful information. 
-URL PARSING GUIDE: ------------------ +🔗 **URL PARSING GUIDE** (YOUR responsibility): +------------------------------------------------- Common Prow job URL formats: - Full URL: https://prow.ci.openshift.org/view/gcs/test-platform-results/logs/JOB_NAME/BUILD_ID - GCS URL: https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/JOB_NAME/BUILD_ID -To extract job_name and build_id from URLs: +**HOW YOU EXTRACT job_name and build_id from URLs:** 1. Look for the pattern: /logs/JOB_NAME/BUILD_ID -2. JOB_NAME is typically a long string like: periodic-ci-openshift-release-master-ci-4.20-e2e-aws-ovn-upgrade -3. BUILD_ID is a long numeric string like: 1879536719736156160 +2. JOB_NAME is typically a long string like: `periodic-ci-openshift-release-master-ci-4.20-e2e-aws-ovn-upgrade` +3. BUILD_ID is a long numeric string like: `1879536719736156160` + +**EXAMPLES:** +- URL: `https://prow.ci.openshift.org/view/gcs/test-platform-results/logs/periodic-ci-openshift-multiarch-master-nightly-4.21-ocp-e2e-ovn-remote-s2s-libvirt-ppc64le/1964900126069624832` +- YOU extract: job_name=`periodic-ci-openshift-multiarch-master-nightly-4.21-ocp-e2e-ovn-remote-s2s-libvirt-ppc64le`, build_id=`1964900126069624832` +- YOU call: `installation_analyst_agent(job_name="periodic-ci-openshift-multiarch-master-nightly-4.21-ocp-e2e-ovn-remote-s2s-libvirt-ppc64le", build_id="1964900126069624832")` + +🚨 **MANDATORY PRE-FLIGHT CHECK**: If you cannot extract the job_name and build_id from the URL, +IMMEDIATELY ask the user to provide these values explicitly. +Do this step before calling ANY subagent. ALL subagents described below REQUIRE ONLY the job_name and build_id +to be provided, so you CANNOT proceed with ANY analysis until you have BOTH values. + +🔄 **WORKFLOW EXECUTION** (YOUR step-by-step responsibilities): +1. **YOU PARSE**: Extract job_name and build_id from the Prow job URL (MANDATORY before proceeding) +2. 
**YOU CALL**: installation_analyst_agent(job_name=extracted_value, build_id=extracted_value) +3. **YOU CALL**: e2e_test_analyst_agent(job_name=same_value, build_id=same_value) +4. **YOU ANALYZE**: Provide a comprehensive summary combining both analyses +5. **YOU DECIDE**: Only call mustgather_analyst_agent(job_name=same_value, build_id=same_value) if needed + +🚨 **CRITICAL RULES**: +- **YOU extract** job_name and build_id from URLs - NEVER ask sub-agents to do this +- **YOU pass** only job_name and build_id parameters to sub-agents +- **YOU never** pass URLs, extraction requests, or user-facing text to sub-agents +- **Sub-agents receive** only the two extracted string parameters: job_name, build_id +- **CRITICAL** Sub-agents require ONLY job_name and build_id as input parameters. They will obtain test_name and other details internally from the job metadata. + +WORKFLOW HALT CONDITIONS: +- Missing job_name → STOP and request from user +- Missing build_id → STOP and request from user +- Invalid URL format → STOP and provide parsing guidance +- DO NOT call sub-agents until you have both job_name and build_id ERROR HANDLING: --------------- If either analyst returns an error message starting with "❌", this indicates: 1. Invalid job name or build ID 2. Logs not available for this job/build @@ -39,6 +66,12 @@ 3. Suggest the user try a different, more recent job 4. 
Provide the manual check URL for user verification +IMPORTANT NOTES: +- If any analyst returns an error (starting with "❌"), acknowledge the error and provide the suggested troubleshooting steps +- Always include the manual check URLs provided by the analysts for user verification +- If logs are not available, suggest the user try a more recent job or verify the URL is correct +- Provide clear, actionable recommendations based on the available analysis + CI JOB ANALYSIS WORKFLOW: ------------------------- When analyzing a job failure, follow this MANDATORY workflow for every job analysis: @@ -54,38 +87,28 @@ Here's the step-by-step breakdown. For each step, explicitly call the designated subagent and adhere strictly to the specified input and output formats: -* Installation Analysis (Subagent: installation_analyst) - MANDATORY +* Installation Analysis (Subagent: installation_analyst_agent) - MANDATORY -Input: Prompt the user to provide the link to the prow job they wish to analyze. -Action: Parse the URL for the job_name and build_id. Call the installation_analyst subagent, passing the user-provided job_name and build_id. -Expected Output: The installation_analyst subagent MUST return the job's job_name, build_id, test_name and a comprehensive data analysis for the installation of the cluster for the given job. +**YOUR RESPONSIBILITY**: First, YOU extract job_name and build_id from the user-provided Prow job URL. +**THEN**: Call the installation_analyst_agent subagent with the extracted job_name and build_id as parameters. +**NEVER**: Ask sub-agents to extract URLs or parse job information - YOU do this step. +Expected Output: The installation_analyst_agent subagent MUST return comprehensive installation analysis including job details and cluster installation metrics. 
-* E2E Test Analysis (Subagent: e2e_test_analyst) - MANDATORY +* E2E Test Analysis (Subagent: e2e_test_analyst_agent) - MANDATORY -Input: The installation_analysis_output from the installation_analyst subagent. -Action: ALWAYS call the e2e_test_analyst subagent, passing the job_name and build_id from the installation analysis. This will analyze the e2e test logs, extract openshift-tests binary commit information, identify failed tests, and provide source code links. -Expected Output: The e2e_test_analyst subagent MUST return a comprehensive analysis of the e2e test execution, including: +**YOUR ACTION**: Call the e2e_test_analyst_agent subagent with the same job_name and build_id you extracted in step 1. +**PARAMETERS TO PASS**: job_name, build_id (extracted by YOU from the URL) +**NEVER**: Ask the agent to extract or parse anything - just pass the parameters. +Expected Output: The e2e_test_analyst_agent subagent MUST return a comprehensive analysis of the e2e test execution, including: - openshift-tests binary commit information and source code links - Failed test details with GitHub links to test source code - Test execution patterns and performance insights - Root cause analysis of test failures -* Must_Gather Analysis (Subagent: mustgather_analyst) - OPTIONAL +* Must_Gather Analysis (Subagent: mustgather_analyst_agent) - OPTIONAL -Input: The installation_analysis_output from the installation_analyst subagent. Use /tmp/must-gather as the target_folder for the must-gather directory. -Action: Only call if additional cluster-level debugging is needed. Call the mustgather_analyst subagent, passing the job_name, test_name and build_id. Download the must-gather logs: use /tmp/must-gather as the target_folder. Then analyze them by navigating the directory structure, reading files and searching for relevant information. -Expected Output: The mustgather_analyst subagent MUST return a comprehensive data analysis for the execution of the given job. 
- -WORKFLOW EXECUTION: -1. Parse the Prow job URL to extract job_name and build_id -2. Call installation_analyst with job_name and build_id -3. IMMEDIATELY call e2e_test_analyst with the same job_name and build_id -4. Provide a comprehensive summary combining both analyses -5. Only call mustgather_analyst if specifically requested or if deeper analysis is needed - -IMPORTANT NOTES: -- If any analyst returns an error (starting with "❌"), acknowledge the error and provide the suggested troubleshooting steps -- Always include the manual check URLs provided by the analysts for user verification -- If logs are not available, suggest the user try a more recent job or verify the URL is correct -- Provide clear, actionable recommendations based on the available analysis -""" +**YOUR ACTION**: Only call if additional cluster-level debugging is needed. Call the mustgather_analyst_agent subagent with the same job_name and build_id you extracted in step 1. +**PARAMETERS TO PASS**: job_name, build_id (extracted by YOU from the URL) +**NEVER**: Ask the agent to extract or parse anything - just pass the parameters. +Expected Output: The mustgather_analyst_agent subagent MUST return a comprehensive data analysis for the execution of the given job. +""" \ No newline at end of file diff --git a/sub_agents/e2e_test_analyst/prompt.py b/sub_agents/e2e_test_analyst/prompt.py index 89950cf..e1853b5 100644 --- a/sub_agents/e2e_test_analyst/prompt.py +++ b/sub_agents/e2e_test_analyst/prompt.py @@ -2,6 +2,13 @@ E2E_TEST_SPECIALIST_PROMPT = """You are an expert OpenShift E2E Test Analyst specializing in analyzing end-to-end test results from CI/CD pipelines. 
+🚨 **CRITICAL REQUIREMENTS - READ FIRST**: +- You will be called with ONLY job_name and build_id as input parameters +- These are the ONLY parameters you need to start analysis +- You will obtain test_name and other details internally from job metadata +- If job_name or build_id are missing or invalid, IMMEDIATELY halt and report the error +- Do NOT request additional parameters from the caller - all information comes from the job metadata + Your primary responsibilities include: 1. Analyzing e2e test logs from OpenShift CI jobs 2. Identifying test failures, flakes, and patterns @@ -33,20 +40,36 @@ - Resource constraints and performance issues - Operator and component health checks -Available tools: -- get_job_metadata: Get basic job information and status -- get_e2e_test_logs: Fetch e2e test logs with commit info and source code links -- get_junit_results: Get JUnit XML test results when available +🛠️ **AVAILABLE TOOLS**: +- **get_job_metadata_tool**: Get basic job information and status (CALL FIRST) + - Input: job_name, build_id (from caller) + - Output: Job metadata, status, test_name, and basic information +- **get_e2e_test_logs_tool**: Fetch e2e test logs with commit info and source code links + - Input: job_name, build_id (from caller), test_name (from job metadata) + - Output: Test logs, openshift-tests commit info, failure details with GitHub links +- **get_junit_results_tool**: Get JUnit XML test results when available + - Input: job_name, build_id (from caller), test_name (from job metadata) + - Output: Structured JUnit test results and statistics + +⚠️ **ERROR HANDLING**: +If you receive incomplete parameters or any tool returns errors: +1. Verify job_name and build_id are correctly provided +2. Check if the job exists and has completed +3. Inform the user of the specific missing requirements +4. Do NOT attempt analysis with incomplete data -When analyzing test results: -1. Start by getting job metadata to understand the test context -2. 
Fetch the e2e test logs which will automatically extract: +📋 **ANALYSIS WORKFLOW**: +1. **FIRST**: Call get_job_metadata_tool with the provided job_name and build_id to: + - Understand the test context + - Obtain the test_name needed for subsequent calls + - Get job status and basic information +2. **SECOND**: Use get_e2e_test_logs_tool with job_name, build_id, and test_name (from step 1) to fetch the e2e test logs which will automatically extract: - openshift-tests binary commit information - Failed test names and durations - Source code links for each failure -3. Look for JUnit results for additional structured test data -4. Identify failed tests, their failure reasons, and patterns -5. Provide actionable insights and recommendations +3. **THIRD**: Use get_junit_results_tool with job_name, build_id, and test_name (from step 1) for additional structured test data +4. **ANALYZE**: Identify failed tests, their failure reasons, and patterns +5. **REPORT**: Provide actionable insights and recommendations with source code links Focus on: - Test failure root causes with links to source code diff --git a/sub_agents/installation_analyst/prompt.py b/sub_agents/installation_analyst/prompt.py index b67416b..00e60a5 100644 --- a/sub_agents/installation_analyst/prompt.py +++ b/sub_agents/installation_analyst/prompt.py @@ -8,6 +8,13 @@ def get_user_prompt(): INSTALLATION_SPECIALIST_PROMPT = """You are an expert OpenShift Installation Analyst specializing in analyzing cluster installation processes from CI/CD pipelines. 
+🚨 **CRITICAL REQUIREMENTS - READ FIRST**: +- You will be called with ONLY job_name and build_id as input parameters +- These are the ONLY parameters you need to start analysis +- You will obtain test_name and other details internally from job metadata +- If job_name or build_id are missing or invalid, IMMEDIATELY halt and report the error +- Do NOT request additional parameters from the caller - all information comes from the job metadata + Your primary focus is analyzing the build-log.txt file from installation directories to extract key installation metrics and identify issues. CORE RESPONSIBILITIES: @@ -44,18 +51,32 @@ def get_user_prompt(): - Error identification and categorization - Log analysis for troubleshooting -Available tools: -- get_job_metadata: Get basic job information and metadata -- get_install_logs: Fetch and analyze build-log.txt with structured information extraction - -ANALYSIS WORKFLOW: -1. Start with job metadata to understand the test context -2. Fetch installation logs from build-log.txt which automatically extracts: +🛠️ **AVAILABLE TOOLS**: +- **get_job_metadata_tool**: Get basic job information and metadata (CALL FIRST) + - Input: job_name, build_id (from caller) + - Output: Job metadata, status, test_name, and basic information +- **get_install_logs_tool**: Fetch and analyze build-log.txt with structured information extraction + - Input: job_name, build_id (from caller), test_name (from job metadata) + - Output: Installation logs, installer commit info, timing data, configuration details + +⚠️ **ERROR HANDLING**: +If you receive incomplete parameters or any tool returns errors: +1. Verify job_name and build_id are correctly provided +2. Check if the job exists and installation logs are available +3. Inform the user of the specific missing requirements +4. Do NOT attempt analysis with incomplete data + +📋 **ANALYSIS WORKFLOW**: +1. 
**FIRST**: Call get_job_metadata_tool with the provided job_name and build_id to: + - Understand the test context + - Obtain the test_name needed for subsequent calls + - Get job status and basic information +2. **SECOND**: Use get_install_logs_tool with job_name, build_id, and test_name (from step 1) to fetch installation logs from build-log.txt which automatically extracts: - Installer binary version and commit - Instance types and cluster configuration - Installation duration and success status - Key configuration parameters -3. Provide structured analysis of installation process +3. **FINALLY**: Provide structured analysis of installation process combining all gathered information 4. Identify any issues, bottlenecks, or configuration problems FOCUS AREAS: diff --git a/sub_agents/mustgather_analyst/prompt.py b/sub_agents/mustgather_analyst/prompt.py index 383d260..f0a2e3d 100644 --- a/sub_agents/mustgather_analyst/prompt.py +++ b/sub_agents/mustgather_analyst/prompt.py @@ -1,13 +1,86 @@ -MUST_GATHER_SPECIALIST_PROMPT = """ -You are a helpful Kubernetes and Prow expert assistant. -You are specialized in Openshift installation. -Your main goal is to analyze the Prow job and diagnose possible failures in the installation and tests performed by the Prow job. -You provide root cause analysis for the failures and propose solutions if possible. -You are truthful, concise, and helpful. -You never speculate about clusters being installed or fabricate information. -If you do not know the answer, you acknowledge the fact and end your response. -Your responses must be as short as possible. +MUST_GATHER_SPECIALIST_PROMPT = """You are an expert OpenShift Must-Gather Analyst specializing in deep cluster diagnostics and troubleshooting from CI/CD pipelines. 
+ +🚨 **CRITICAL REQUIREMENTS - READ FIRST**: +- You will be called with ONLY job_name and build_id as input parameters +- These are the ONLY parameters you need to start analysis +- For must-gather download, you will need to obtain test_name internally first +- If job_name or build_id are missing or invalid, IMMEDIATELY halt and report the error +- Do NOT request additional parameters from the caller - all information comes from the job metadata + +Your primary responsibilities include: +1. Downloading and analyzing must-gather diagnostic data from OpenShift CI jobs +2. Performing deep cluster-level troubleshooting and root cause analysis +3. Examining cluster resources, logs, and configurations for anomalies +4. Identifying infrastructure, networking, and resource-related issues +5. Providing actionable insights based on cluster state information + +CORE ANALYSIS AREAS: +🔍 **CLUSTER STATE DIAGNOSTICS**: +- Node health and resource utilization +- Pod scheduling and placement issues +- Service and networking connectivity problems +- Storage and persistent volume issues +- Operator status and reconciliation loops + +📊 **RESOURCE ANALYSIS**: +- Resource constraints and limits +- Memory and CPU utilization patterns +- Network policy and security context issues +- RBAC and permission problems +- Custom resource definitions and operators + +🛠️ **AVAILABLE TOOLS**: +- **get_job_metadata_tool**: Get basic job information and metadata (CALL FIRST) + - Input: job_name, build_id (from caller) + - Output: Job metadata, status, test_name, and basic information +- **get_must_gather**: Download must-gather diagnostic data + - Input: job_name, build_id (from caller), test_name (from job metadata), target_folder (default: /tmp/must-gather) + - Output: Downloads complete must-gather archive to specified directory +- **File analysis tools** (work with downloaded files): + - **list_directory**: Navigate directory structure + - **read_drained_file**: Read and analyze log files with 
smart content extraction + - **get_file_info**: Get file metadata and preview content + - **search_files**: Search for specific patterns across multiple files -First, download a job's must-gather using 'get_must_gather' tool. -Then, once you have the files on disk, browse through the files, analyze the failures and provide a root cause analysis for the failures. -""" \ No newline at end of file +⚠️ **ERROR HANDLING**: +If you receive incomplete parameters or any tool returns errors: +1. Verify job_name and build_id are correctly provided +2. Check if the job exists and must-gather data is available +3. Inform the user of the specific missing requirements +4. Do NOT attempt analysis with incomplete data + +📋 **ANALYSIS WORKFLOW**: +1. **FIRST**: Call get_job_metadata_tool with the provided job_name and build_id to: + - Understand the test context + - Obtain the test_name needed for must-gather download + - Get job status and basic information +2. **SECOND**: Use get_must_gather with job_name, build_id, and test_name (from step 1) to download diagnostic data + - Use /tmp/must-gather as the standard target_folder location + - This downloads the complete must-gather archive +3. **NAVIGATE**: Use file analysis tools to systematically explore the must-gather directory structure +4. **ANALYZE**: Focus on key areas: + - Cluster operator status and logs + - Node conditions and resource usage + - Pod failures and restart patterns + - Network connectivity issues + - Storage and volume problems +5. **CORRELATE**: Connect findings with installation and e2e test issues +6. 
**REPORT**: Provide specific, actionable root cause analysis with file references + +FOCUS AREAS: +- **Infrastructure Issues**: Node problems, resource constraints, hardware failures +- **Networking Problems**: CNI issues, service discovery failures, ingress problems +- **Storage Issues**: Persistent volume problems, storage class issues +- **Operator Failures**: Custom resource reconciliation, operator degradation +- **Security Issues**: RBAC problems, security context violations +- **Performance Problems**: Resource bottlenecks, scheduling issues + +**CRITICAL**: Always provide: +- Clear correlation between must-gather findings and observed failures +- Specific file paths and log entries that support your analysis +- Actionable recommendations for issue resolution +- References to relevant OpenShift documentation when applicable + +You are truthful, concise, and helpful. You never speculate about clusters or fabricate information. +If you do not know the answer, you acknowledge the fact and end your response. 
+Your responses must be thorough yet concise, providing maximum diagnostic value.""" \ No newline at end of file From e3ff1dbbf520b7cebe6493bdf75aebb9468be393 Mon Sep 17 00:00:00 2001 From: Dylan Orzel Date: Wed, 10 Sep 2025 12:04:04 -0600 Subject: [PATCH 2/2] Add function docstrings to assist ADK with tool calling, add a few agent tools and must-gather implementations --- .gitignore | 6 +- sub_agents/e2e_test_analyst/agent.py | 96 ++++++++- sub_agents/installation_analyst/agent.py | 170 ++++++++++++--- sub_agents/mustgather_analyst/__init__.py | 2 + sub_agents/mustgather_analyst/agent.py | 61 +++++- sub_agents/mustgather_analyst/must_gather.py | 212 ++++++++++++++----- 6 files changed, 462 insertions(+), 85 deletions(-) diff --git a/.gitignore b/.gitignore index c18dd8d..6a2cb0f 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,5 @@ -__pycache__/ +__pycache__ +**/__pycache__ +*.pyc +*.pyo +*.pyd diff --git a/sub_agents/e2e_test_analyst/agent.py b/sub_agents/e2e_test_analyst/agent.py index ff0bd62..f678b38 100644 --- a/sub_agents/e2e_test_analyst/agent.py +++ b/sub_agents/e2e_test_analyst/agent.py @@ -6,7 +6,6 @@ import asyncio import httpx -import threading import concurrent.futures import re import os @@ -139,10 +138,11 @@ def generate_source_code_links(test_name: str, commit_hash: Optional[str] = None async def get_e2e_test_logs_async(job_name: str, build_id: str, test_name: str) -> str: """Get e2e test logs from Prow.""" - # E2E test logs are typically in openshift-e2e-test directory if "sno" in test_name: e2e_test_path = f"artifacts/{test_name}/single-node-e2e-test/build-log.txt" + elif "libvirt" in test_name: + e2e_test_path = f"artifacts/{test_name}/openshift-e2e-libvirt-test/build-log.txt" else: e2e_test_path = f"artifacts/{test_name}/openshift-e2e-test/build-log.txt" @@ -269,6 +269,8 @@ async def get_junit_results_async(job_name: str, build_id: str, test_name: str) # E2E test logs are typically in openshift-e2e-test directory if "sno" in test_name: 
e2e_test_path = f"artifacts/{test_name}/single-node-e2e-test" + elif "libvirt" in test_name: + e2e_test_path = f"artifacts/{test_name}/openshift-e2e-libvirt-test" else: e2e_test_path = f"artifacts/{test_name}/openshift-e2e-test" @@ -316,16 +318,92 @@ def run_in_thread(): return future.result() def get_job_metadata_tool(job_name: str, build_id: str): - """Get metadata and status for a specific Prow job name and build ID.""" + """Retrieves comprehensive metadata and status information for a specific Prow CI job. + + This tool fetches the prowjob.json metadata which contains essential information about + the CI job execution, including current status, build configuration, test targets, + and execution parameters. This is typically the first tool to use when analyzing a failed CI job. + + Args: + job_name (str): The name of the Prow job (e.g., 'periodic-ci-openshift-multiarch-master-nightly-4.20-ocp-e2e-aws-ovn-sno') + build_id (str): The specific build ID for the job run (e.g., '1940296163760541696') + + Returns: + dict: Job metadata including status, build_id, job_name, test_name, and error details if applicable + """ return run_async_in_thread(get_job_metadata_async(job_name, build_id)) -def get_e2e_test_logs_tool(job_name: str, build_id: str, test_name: str): - """Get e2e test logs from the openshift-e2e-test directory with commit info and source code links.""" - return run_async_in_thread(get_e2e_test_logs_async(job_name, build_id, test_name)) +def get_e2e_test_logs_tool(job_name: str, build_id: str, test_name: str, include_full_log: bool = True): + """Analyzes end-to-end test execution logs with source code tracing and failure analysis. + + This tool retrieves and analyzes e2e test logs from the openshift-e2e-test directory, + extracting critical information including failed test details, openshift-tests binary version, + source code commit information, and direct links to test source code for debugging. 
-def get_junit_results_tool(job_name: str, build_id: str, test_name: str): - """Get JUnit test results from the e2e test artifacts.""" - return run_async_in_thread(get_junit_results_async(job_name, build_id, test_name)) + Args: + job_name (str): The name of the Prow job containing e2e tests + build_id (str): The specific build ID for the job run + test_name (str): The test component name (e.g., 'ocp-e2e-aws-ovn-sno-multi-a-a') + include_full_log (bool, optional): Whether to include the complete log content in response. + If False, only provides summary and key sections. Defaults to True. + + Returns: + str: Comprehensive e2e test analysis including failed tests with source links, + openshift-tests binary info, and formatted log content + """ + result = run_async_in_thread(get_e2e_test_logs_async(job_name, build_id, test_name)) + + # If include_full_log is False, remove the full log section to reduce response size + if not include_full_log and isinstance(result, str): + lines = result.split('\n') + filtered_lines = [] + skip_full_log = False + + for line in lines: + if line.startswith('📋 FULL E2E TEST LOG:'): + skip_full_log = True + continue + if not skip_full_log: + filtered_lines.append(line) + + result = '\n'.join(filtered_lines) + + return result + +def get_junit_results_tool(job_name: str, build_id: str, test_name: str, parse_xml: bool = True): + """Retrieves JUnit XML test results from e2e test execution with structured failure analysis. + + This tool fetches JUnit XML files generated by e2e test runs, which contain structured + test results including pass/fail status, execution times, error messages, and detailed + failure information. JUnit results are often more structured than raw logs. 
+ + Args: + job_name (str): The name of the Prow job containing e2e tests + build_id (str): The specific build ID for the job run + test_name (str): The test component name that generated JUnit results + parse_xml (bool, optional): Whether to provide structured parsing hints for XML content. + If True, includes guidance for extracting test results. Defaults to True. + + Returns: + str: JUnit test results content with parsing guidance if requested, + or error message if results are not found + """ + result = run_async_in_thread(get_junit_results_async(job_name, build_id, test_name)) + + if parse_xml and isinstance(result, str) and not result.startswith("Could not find") and not result.startswith("Error"): + # Add XML parsing guidance for the LLM + guidance = """ +💡 JUNIT XML ANALYSIS TIPS: +- Look for or elements within elements +- Check 'name' and 'classname' attributes for test identification +- Examine failure messages in or sections +- Count total tests, failures, errors, and skipped from attributes +- Look for patterns in failure messages to identify common root causes + +""" + result = guidance + result + + return result e2e_test_analyst_agent = Agent( model=LiteLlm(model=MODEL), diff --git a/sub_agents/installation_analyst/agent.py b/sub_agents/installation_analyst/agent.py index e5ad7bb..50c4706 100644 --- a/sub_agents/installation_analyst/agent.py +++ b/sub_agents/installation_analyst/agent.py @@ -174,8 +174,17 @@ async def get_install_logs_async(job_name: str, build_id: str, test_name: str) - """Get installation logs from build-log.txt in installation directories.""" # List of possible installation directory patterns install_dirs = [ + # IPI (Installer-Provisioned Infrastructure) patterns + "ipi-install-powervs-install", + "ipi-install-libvirt", + "ipi-install-libvirt-install", "ipi-install-install", - "ipi-install-install-stableinitial" + "ipi-install-install-stableinitial", + # UPI (User-Provisioned Infrastructure) patterns + "upi-install-libvirt", + 
"upi-install-libvirt-install", + "upi-install-install", + "upi-install-install-stableinitial", ] base_url = f"{GCS_URL}/{job_name}/{build_id}" # Construct the base artifacts URL @@ -268,30 +277,75 @@ async def get_install_logs_async(job_name: str, build_id: str, test_name: str) - except Exception as e: continue # Try next directory pattern - # If no logs found, return error message with helpful details - return f"""❌ INSTALLATION ANALYSIS FAILED + # Analyze job type to provide better guidance + job_analysis = analyze_job_type(job_name) + manual_check_url = f"https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/{job_name}/{build_id}/artifacts/{test_name}/" -Could not find installation logs for job: {job_name} -Build ID: {build_id} + return f"""❌ **INSTALLATION ANALYSIS RESULTS** -🔍 DEBUGGING INFO: -- test_name: {test_name} -- Base URL: {base_url} -- Tried directories: {', '.join(install_dirs)} +**Job Details:** +- Job Name: `{job_name}` +- Build ID: `{build_id}` +- Test Name: `{test_name}` +- **Job Type**: {job_analysis['type']} -🔗 Manual check: {base_url}/ +**Analysis:** +{job_analysis['explanation']} -⚠️ POSSIBLE CAUSES: -1. Build ID might be invalid or logs not yet available -2. Job might not have installation logs (e.g., upgrade-only jobs) -3. Directory structure might be different for this job type -4. Logs might be in a different location +**Searched Directories:** +{chr(10).join([f"- {dir}" for dir in install_dirs])} -💡 SUGGESTIONS: -1. Verify the Prow job URL is correct -2. Check if the job has completed successfully -3. Try browsing the base URL manually to see available directories -4. 
Use a different job that includes installation steps""" +**Manual Check:** [View available artifacts]({manual_check_url}) + +**Recommendations:** +{job_analysis['recommendations']} + +**Next Steps:** +- For E2E test jobs: Use the e2e_test_analyst agent for test failure analysis +- For non-installation jobs: Skip installation analysis and focus on test results +- For installation jobs: Verify the job completed and check alternative directories""" + +def analyze_job_type(job_name: str) -> Dict[str, str]: + """Analyze job type to provide better guidance when installation logs aren't found.""" + job_type_info = { + "type": "Unknown", + "explanation": "Job type could not be determined.", + "recommendations": "1. Check if this job actually performs installation\n2. Try manual inspection of the artifacts directory" + } + + # E2E test jobs + if "e2e" in job_name: + job_type_info.update({ + "type": "E2E Test Job", + "explanation": "This appears to be an end-to-end test job that may run tests on pre-existing clusters rather than performing fresh installations.", + "recommendations": "1. Consider using the e2e_test_analyst agent instead\n2. This job may not have traditional installation logs\n3. Check if cluster setup logs exist in alternative directories" + }) + + # Upgrade jobs + elif "upgrade" in job_name: + job_type_info.update({ + "type": "Upgrade Job", + "explanation": "This is an upgrade job that starts with an existing cluster and upgrades it.", + "recommendations": "1. Look for upgrade logs instead of installation logs\n2. Check directories like 'upgrade' or 'openshift-upgrade'\n3. Consider analyzing the upgrade process rather than installation" + }) + + # Libvirt/UPI jobs + elif "libvirt" in job_name or "upi" in job_name: + job_type_info.update({ + "type": "UPI/Libvirt Job", + "explanation": "This job uses User-Provisioned Infrastructure (UPI) or libvirt, which may have different log directory structures than IPI jobs.", + "recommendations": "1. 
Look for UPI-specific directories\n2. Check for libvirt, baremetal, or setup directories\n3. Installation process may be in cluster-setup or similar directories" + }) + + # IPI jobs (traditional) + elif "ipi" in job_name: + job_type_info.update({ + "type": "IPI Installation Job", + "explanation": "This should be a standard Installer-Provisioned Infrastructure job with traditional installation logs.", + "recommendations": "1. Verify the build ID is correct and job has completed\n2. Check if logs are in alternative ipi directories\n3. This job should have installation logs - may be a temporary issue" + }) + + return job_type_info def run_async_in_thread(coro): """Run async function in a thread to avoid event loop conflicts.""" @@ -309,12 +363,80 @@ def run_in_thread(): return future.result() def get_job_metadata_tool(job_name: str, build_id: str): - """Get metadata and status for a specific Prow job name and build ID.""" + """Retrieves comprehensive metadata and status information for a specific Prow CI job. + + This tool fetches the prowjob.json metadata which contains essential information about + the CI job execution, including current status, build configuration, test targets, + and execution parameters. This is typically the first tool to use when analyzing a failed CI job. 
+ + Args: + job_name (str): The name of the Prow job + build_id (str): The specific build ID for the job run + + Returns: + dict: Job metadata including status, build_id, job_name, test_name, and error details if applicable + """ return run_async_in_thread(get_job_metadata_async(job_name, build_id)) -def get_install_logs_tool(job_name: str, build_id: str, test_name: str): - """Get installation logs from build-log.txt in installation directories with detailed analysis.""" - return run_async_in_thread(get_install_logs_async(job_name, build_id, test_name)) +def get_install_logs_tool(job_name: str, build_id: str, test_name: str, include_full_log: bool = True, focus_on_errors: bool = False): + """Analyzes OpenShift cluster installation logs with comprehensive configuration and failure analysis. + + This tool retrieves and analyzes installation logs from build-log.txt files, extracting + critical information including openshift-install binary version, release image details, + cluster configuration, installation duration, and detailed failure analysis if installation failed. + + Args: + job_name (str): The name of the Prow job containing installation steps + build_id (str): The specific build ID for the job run + test_name (str): The test component name that performed installation + include_full_log (bool, optional): Whether to include complete log content in response. + If False, provides only summary and key sections. Defaults to True. + focus_on_errors (bool, optional): Whether to prioritize error messages and failure patterns. + If True, extracts and highlights error conditions. Defaults to False. 
+ + Returns: + str: Comprehensive installation analysis including installer info, cluster config, + instance types, installation results, and log content with error highlighting if requested + """ + result = run_async_in_thread(get_install_logs_async(job_name, build_id, test_name)) + + # Process result based on options + if isinstance(result, str): + # If include_full_log is False, remove the full log section to reduce response size + if not include_full_log: + lines = result.split('\n') + filtered_lines = [] + skip_full_log = False + + for line in lines: + if line.startswith('📋 FULL INSTALLATION LOG:'): + skip_full_log = True + continue + if not skip_full_log: + filtered_lines.append(line) + + result = '\n'.join(filtered_lines) + + # If focus_on_errors is True, add error analysis guidance + if focus_on_errors and not result.startswith('❌'): + error_guidance = """ +🔍 ERROR ANALYSIS FOCUS: +- Look for lines containing 'level=error', 'FATAL', 'failed', or 'Error:' +- Check for timeout messages or resource provisioning failures +- Examine AWS/cloud provider error codes and messages +- Look for certificate, DNS, or networking related errors +- Check for quota or permission issues +- Review any stack traces or detailed error descriptions + +""" + # Insert guidance after the header but before the content + parts = result.split('\n\n', 1) + if len(parts) == 2: + result = parts[0] + '\n\n' + error_guidance + parts[1] + else: + result = error_guidance + result + + return result installation_analyst_agent = Agent( model=LiteLlm(model=MODEL), diff --git a/sub_agents/mustgather_analyst/__init__.py b/sub_agents/mustgather_analyst/__init__.py index 73736b3..680e7d5 100644 --- a/sub_agents/mustgather_analyst/__init__.py +++ b/sub_agents/mustgather_analyst/__init__.py @@ -1 +1,3 @@ from .agent import mustgather_analyst_agent + +__all__ = ["mustgather_analyst_agent"] \ No newline at end of file diff --git a/sub_agents/mustgather_analyst/agent.py 
b/sub_agents/mustgather_analyst/agent.py index c145d57..f6d42e5 100644 --- a/sub_agents/mustgather_analyst/agent.py +++ b/sub_agents/mustgather_analyst/agent.py @@ -2,14 +2,73 @@ from . import prompt from google.adk.models.lite_llm import LiteLlm import os +import requests +from typing import Dict, Any from .must_gather import get_must_gather, list_directory, read_drained_file, get_file_info, search_files +GCS_URL = "https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs" + MODEL = os.environ.get("MODEL", "qwen3:4b") +def get_job_metadata(job_name: str, build_id: str) -> Dict[str, Any]: + """Get the metadata and status for a specific Prow job name and build id.""" + url = f"{GCS_URL}/{job_name}/{build_id}/prowjob.json" + try: + response = requests.get(url, timeout=30) + response.raise_for_status() + data = response.json() + + if not data: + return {"error": "No response from Prow API"} + + job_spec = data.get("spec", {}) + job_status = data.get("status", {}) + + build_id_from_status = job_status.get("build_id") + status = job_status.get("state") + args = job_spec.get("pod_spec", {}).get("containers", [])[0].get("args", []) + test_name = "" + for arg in args: + if arg.startswith("--target="): + test_name = arg.replace("--target=", "") + + return { + "status": status, + "build_id": build_id_from_status, + "job_name": job_name, + "test_name": test_name + } + + except Exception as e: + return {"error": f"Failed to fetch job info: {str(e)}"} + +def get_job_metadata_tool(job_name: str, build_id: str): + """Retrieves comprehensive metadata and status information for a specific Prow CI job. + + This tool fetches the prowjob.json metadata which contains essential information about + the CI job execution, including current status, build configuration, test targets, + and execution parameters. This is typically the first tool to use when analyzing a failed CI job. 
+ + Args: + job_name (str): The name of the Prow job + build_id (str): The specific build ID for the job run + + Returns: + dict: Job metadata including status, build_id, job_name, test_name, and error details if applicable + """ + return get_job_metadata(job_name, build_id) + mustgather_analyst_agent = Agent( model=LiteLlm(model=MODEL), name="mustgather_analyst_agent", instruction=prompt.MUST_GATHER_SPECIALIST_PROMPT, output_key="must_gather_analysis_output", - tools=[get_must_gather, list_directory, read_drained_file, get_file_info, search_files], + tools=[ + get_job_metadata_tool, + get_must_gather, + list_directory, + read_drained_file, + get_file_info, + search_files, + ], ) \ No newline at end of file diff --git a/sub_agents/mustgather_analyst/must_gather.py b/sub_agents/mustgather_analyst/must_gather.py index db8d26a..3f020fc 100644 --- a/sub_agents/mustgather_analyst/must_gather.py +++ b/sub_agents/mustgather_analyst/must_gather.py @@ -11,21 +11,27 @@ # Global DrainExtractor instance _drain_extractor = DrainExtractor(verbose=False, context=False, max_clusters=1000) -def get_must_gather(job_name: str, build_id: str, test_name: str, target_folder: str) -> dict: - """Retrieves the must-gather archive for a specified job. + +def get_must_gather(job_name: str, build_id: str, test_name: str, target_folder: str = "/tmp/must_gather_analysis") -> dict: + """Downloads and extracts must-gather archive from a failed CI job for analysis. + + This tool retrieves the must-gather diagnostic data collected during a CI job failure. + Must-gather archives contain cluster state information like pod logs, events, and resource definitions + that are essential for root cause analysis of OpenShift cluster issues. 
Args: - job_name: The name of the job - build_id: The build ID for which to get install logs - test_name: The name of the test for which to get install logs + job_name (str): The name of the Prow job that failed + build_id (str): The specific build ID from the job run + test_name (str): The test component name that generated must-gather (e.g., 'ocp-e2e-aws-ovn-sno-multi-a-a') + target_folder (str, optional): Local directory to download and extract the archive. Defaults to '/tmp/must_gather_analysis'. + Returns: - dict: A dictionary containing the must-gather information. - Includes a 'status' key ('success' or 'error'). - If 'success', includes a 'path' key pointing to must-gather logs. - If 'error', includes an 'error_message' key. + dict: A dictionary containing the must-gather retrieval result. + - If successful: {'status': 'success', 'path': '/path/to/extracted/files'} + - If failed: {'status': 'error', 'error_message': 'description of the error'} """ gsURL = "gs://test-platform-results/logs/"+job_name+"/"+build_id+"/artifacts/"+test_name+"/gather-must-gather/artifacts" destination_folder = target_folder+"/"+job_name+"/"+build_id+"/"+test_name try: @@ -48,7 +53,6 @@ def get_must_gather(job_name: str, build_id: str, test_name: str, target_folder: else: return {"status": "error", "error_message": f"must-gather.tar not found in {destination_folder}"} return {"status": "success", "path": destination_folder} - def download_from_gs(gs_url, destination_folder): @@ -93,23 +97,34 @@ def download_from_gs(gs_url, destination_folder): print(f"Error downloading from GCS: {e}")
This is especially useful for analyzing + repetitive error messages, warnings, and events in OpenShift cluster logs. - - - -def read_drained_file(path: str) -> dict: - """Read contents of a file Args: - path: The path to the file to read + path (str): The absolute path to the log file to analyze + max_lines (int, optional): Maximum number of lines to read from the file. + If None, reads the entire file. Defaults to None. + Returns: - dict: A dictionary containing the file contents. - Includes a 'status' key ('success' or 'error'). - If 'success', includes a 'patterns' key pointing to a list of patterns found in the file. - If 'error', includes an 'error_message' key. + dict: A dictionary containing the Drain analysis results. + - If successful: {'status': 'success', 'patterns': [{'line_number': int, 'chunk': str, 'chunk_length': int}]} + - If failed: {'status': 'error', 'error_message': 'description of the error'} """ try: with open(path, 'r', encoding='utf-8') as f: - content= f.read() + if max_lines is not None: + lines = [] + for i, line in enumerate(f): + if i >= max_lines: + break + lines.append(line) + content = ''.join(lines) + else: + content = f.read() patterns = _drain_extractor(content) # Convert patterns to a more structured format @@ -124,73 +139,170 @@ def read_drained_file(path: str) -> dict: return {"status": "error", "error_message": f"Error reading file {path}: {e}"} return {"status": "success", "patterns": pattern_results} + +def list_directory(path: str, show_hidden: bool = False, sort_by: str = "name") -> dict: + """Lists files and directories in a must-gather archive or local filesystem. + This tool helps navigate the directory structure of extracted must-gather archives, + which typically contain organized diagnostic data like namespaces, cluster-scoped-resources, + host_service_logs, and other OpenShift cluster information. 
-def list_directory( path: str) -> dict: - """List contents of a directory Args: - path: The path to list the contents of + path (str): The directory path to list contents of + show_hidden (bool, optional): Whether to include hidden files/directories (starting with '.'). + Defaults to False. + sort_by (str, optional): How to sort the directory listing. Options: 'name', 'size', 'modified'. + Defaults to 'name'. + Returns: - dict: A dictionary containing the directory contents. - Includes a 'status' key ('success' or 'error'). - If 'success', includes an 'entries' key pointing to a list of directory entries. - If 'error', includes an 'error_message' key. + dict: A dictionary containing the directory listing results. + - If successful: {'status': 'success', 'entries': ['[DIR] dirname', '[FILE] filename', ...]} + - If failed: {'status': 'error', 'error_message': 'description of the error'} """ try: entries = [] with os.scandir(path) as it: + dir_entries = [] for entry in it: + # Skip hidden files unless requested + if not show_hidden and entry.name.startswith('.'): + continue + prefix = "[DIR]" if entry.is_dir() else "[FILE]" - entries.append(f"{prefix} {entry.name}") + + # Get additional info for sorting + try: + stat_info = entry.stat() + size = stat_info.st_size + modified = stat_info.st_mtime + except OSError: + size = 0 + modified = 0 + + dir_entries.append({ + 'name': entry.name, + 'prefix': prefix, + 'size': size, + 'modified': modified, + 'is_dir': entry.is_dir() + }) + + # Sort entries based on sort_by parameter + if sort_by == "size": + dir_entries.sort(key=lambda x: x['size'], reverse=True) + elif sort_by == "modified": + dir_entries.sort(key=lambda x: x['modified'], reverse=True) + else: # default to name + dir_entries.sort(key=lambda x: x['name'].lower()) + + # Format output + for entry in dir_entries: + entries.append(f"{entry['prefix']} {entry['name']}") + return {"status": "success", "entries": entries} except Exception as e: return {"status": "error", 
"error_message": f"Error listing directory {path}: {e}"} + +def get_file_info(path: str, include_content_preview: bool = False) -> dict: + """Retrieves detailed metadata about files or directories in must-gather archives. + This tool provides comprehensive information about files, including size, timestamps, + and permissions. For log files, it can also provide a content preview to help determine + if the file contains relevant diagnostic information. -def get_file_info(path: str) -> dict: - """Get file/directory metadata Args: - path: The path to get the file/directory metadata for + path (str): The file or directory path to get metadata for + include_content_preview (bool, optional): For text files smaller than 10KB, include + first 10 lines as preview. Defaults to False. + Returns: - dict: A dictionary containing the file/directory metadata. - Includes a 'status' key ('success' or 'error'). - If 'success', includes an 'info' key pointing to a dictionary containing the file/directory metadata. - If 'error', includes an 'error_message' key. + dict: A dictionary containing the file metadata. 
+ - If successful: {'status': 'success', 'info': {size, created, modified, accessed, is_directory, is_file, permissions, [content_preview]}} + - If failed: {'status': 'error', 'error_message': 'description of the error'} """ try: stats = os.stat(path) - return {"status": "success", "info": { + info = { "size": stats.st_size, - "created": datetime.fromtimestamp(stats.st_ctime), - "modified": datetime.fromtimestamp(stats.st_mtime), - "accessed": datetime.fromtimestamp(stats.st_atime), + "created": datetime.fromtimestamp(stats.st_ctime).isoformat(), + "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(), + "accessed": datetime.fromtimestamp(stats.st_atime).isoformat(), "is_directory": os.path.isdir(path), "is_file": os.path.isfile(path), "permissions": oct(stats.st_mode)[-3:] - }} + } + + # Add content preview for small text files if requested + if include_content_preview and os.path.isfile(path) and stats.st_size < 10240: # 10KB limit + try: + with open(path, 'r', encoding='utf-8', errors='ignore') as f: + lines = [] + for i, line in enumerate(f): + if i >= 10: # First 10 lines only + break + lines.append(line.rstrip()) + if lines: + info["content_preview"] = lines + except (UnicodeDecodeError, IOError): + # Skip preview for binary or unreadable files + pass + + return {"status": "success", "info": info} except Exception as e: return {"status": "error", "error_message": f"Error getting file info for {path}: {e}"} -def search_files(start_path: str, pattern: str) -> dict: - """Search for files matching a pattern +def search_files(start_path: str, pattern: str, max_results: int = 100, search_content: bool = False) -> dict: + """Search for files in must-gather archives by filename pattern or content. + + This tool helps locate specific diagnostic files within large must-gather archives. + It's particularly useful for finding log files, configuration files, or resources + related to specific namespaces, pods, or error conditions. 
+ Args: - start_path: The path to start searching from - pattern: The pattern to search for + start_path (str): The directory path to start searching from (typically the must-gather root) + pattern (str): The search pattern to match against filenames (case-insensitive substring match) + max_results (int, optional): Maximum number of matching files to return. Defaults to 100. + search_content (bool, optional): Whether to also search within file contents (slower). + Only applies to text files under 1MB. Defaults to False. + Returns: dict: A dictionary containing the search results. - Includes a 'status' key ('success' or 'error'). - If 'success', includes a 'results' key pointing to a list of matching files. - If 'error', includes an 'error_message' key. + - If successful: {'status': 'success', 'results': ['path1', 'path2', ...]} + - If failed: {'status': 'error', 'error_message': 'description of the error'} """ results = [] try: for root, _, files in os.walk(start_path): for name in files: + full_path = os.path.join(root, name) + matched = False + + # Check filename match if pattern.lower() in name.lower(): - full_path = os.path.join(root, name) + matched = True + + # Check content match if requested + elif search_content: + try: + file_size = os.path.getsize(full_path) + if file_size < 1048576: # 1MB limit for content search + with open(full_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + if pattern.lower() in content.lower(): + matched = True + except (IOError, UnicodeDecodeError): + continue # Skip files that can't be read + + if matched: results.append(full_path) + if len(results) >= max_results: + break + + if len(results) >= max_results: + break + return {"status": "success", "results": results} except Exception as e: return {"status": "error", "error_message": f"Error searching files from {start_path}: {e}"}