diff --git a/ci_analysis_agent/agent.py b/ci_analysis_agent/agent.py index dcf736f..3875ab7 100644 --- a/ci_analysis_agent/agent.py +++ b/ci_analysis_agent/agent.py @@ -22,9 +22,16 @@ from sub_agents.installation_analyst import installation_analyst_agent from sub_agents.e2e_test_analyst import e2e_test_analyst_agent from sub_agents.mustgather_analyst import mustgather_analyst_agent - +from sub_agents.arch_mismatch_detector import arch_mismatch_detector_agent import os MODEL = os.environ.get("MODEL", "ollama_chat/qwen3:4b") + +def get_job_name(url: str) -> str: + return url.split("/")[-2] + +def get_build_id(url: str) -> str: + return url.split("/")[-1] + ci_analysis_advisor = LlmAgent( name="ci_analysis_advisor", model=LiteLlm(model=MODEL), @@ -34,7 +41,10 @@ instruction=prompt.CI_ANALYSIS_COORDINATOR_PROMPT, output_key="ci_analysis_advisor_output", tools=[ + get_job_name, + get_build_id, AgentTool(agent=installation_analyst_agent), + AgentTool(agent=arch_mismatch_detector_agent), AgentTool(agent=e2e_test_analyst_agent), AgentTool(agent=mustgather_analyst_agent), ], diff --git a/ci_analysis_agent/prompt.py b/ci_analysis_agent/prompt.py index f245f66..1a75434 100644 --- a/ci_analysis_agent/prompt.py +++ b/ci_analysis_agent/prompt.py @@ -46,7 +46,7 @@ 2. ALWAYS perform e2e test analysis to identify test failures and patterns 3. Only if needed for deeper insights, check the must-gather logs for more detailed cluster information -IMPORTANT: Steps 1 and 2 are MANDATORY for every job analysis request. Do not skip e2e analysis. +IMPORTANT: Steps 1,2 and 3 are MANDATORY for every job analysis request. Do not skip e2e analysis. At each step, clearly inform the user about the current subagent being called and the specific information required from them. After each subagent completes its task, explain the output provided and how it contributes to the overall root cause analysis process. @@ -60,6 +60,12 @@ Action: Parse the URL for the job_name and build_id. 
Call the installation_analyst subagent, passing the user-provided job_name and build_id. Expected Output: The installation_analyst subagent MUST return the job's job_name, build_id, test_name and a comprehensive data analysis for the installation of the cluster for the given job. +* Arch Mismatch Analysis (Subagent: arch_mismatch_detector) - MANDATORY + +Input: In the input for the function call (arch_mismatch_detector tool), provide the job_name and build_id. +Action: Call the arch_mismatch_detector subagent, passing the user-provided job_name and build_id. +Expected Output: The arch_mismatch_detector subagent MUST return a comprehensive analysis on existence of arch mismatch errors (binaries with exec format error) in the job. + * E2E Test Analysis (Subagent: e2e_test_analyst) - MANDATORY Input: The installation_analysis_output from the installation_analyst subagent. diff --git a/quick-start-containers.sh b/quick-start-containers.sh index 72bbdff..6ca4517 100755 --- a/quick-start-containers.sh +++ b/quick-start-containers.sh @@ -15,8 +15,10 @@ NC='\033[0m' # No Color # Configuration OLLAMA_CONTAINER="ollama" AGENT_CONTAINER="ci-analysis-agent" +LOKI_MCP_CONTAINER="loki-mcp" OLLAMA_VOLUME="ollama-data" OLLAMA_MODEL="qwen3:4b" +MCP_SERVER_PORT="8888" AGENT_PORT="8000" OLLAMA_PORT="11434" USE_GPU="auto" # auto, nvidia, amd, none @@ -175,6 +177,11 @@ cleanup_existing() { podman stop "$AGENT_CONTAINER" 2>/dev/null || true podman rm "$AGENT_CONTAINER" 2>/dev/null || true fi + + if podman container exists "$LOKI_MCP_CONTAINER" 2>/dev/null; then + podman stop "$LOKI_MCP_CONTAINER" 2>/dev/null || true + podman rm "$LOKI_MCP_CONTAINER" 2>/dev/null || true + fi print_success "Cleanup completed" } @@ -280,6 +287,28 @@ start_agent() { print_success "CI Analysis Agent container started" } +build_loki_mcp() { + print_status "Building Loki MCP container..." 
+ rm -rf loki-mcp + git clone https://github.com/sherine-k/loki-mcp.git +# also make sure you checkout branch datasource which has the mcp tool grafana_loki_query + cd loki-mcp + git checkout datasource + cd .. + podman build -t loki-mcp:latest loki-mcp + print_success "Loki MCP container built" +} + +start_loki_mcp() { + print_status "Starting Loki MCP container..." + podman run -d --name "$LOKI_MCP_CONTAINER" -e "LOKI_TOKEN=$LOKI_TOKEN" -e MCP_TRANSPORT=http-stream -p "$MCP_SERVER_PORT:8080" loki-mcp:latest + print_success "Loki MCP container started" + # Wait for MCP server to be ready + print_status "Waiting for MCP server to be ready..." + sleep 2 + +} + + # Function to verify deployment verify_deployment() { print_status "Verifying deployment..." @@ -321,6 +350,19 @@ stop_containers() { print_status "Stopping CI Analysis Agent containers..." # Stop containers + + if podman container exists "$LOKI_MCP_CONTAINER" 2>/dev/null; then + if podman ps | grep -q "$LOKI_MCP_CONTAINER"; then + print_status "Stopping Loki MCP container..." + podman stop "$LOKI_MCP_CONTAINER" + print_success "Loki MCP container stopped" + else + print_warning "Loki MCP container is not running" + fi + else + print_warning "Loki MCP container does not exist" + fi + if podman container exists "$AGENT_CONTAINER" 2>/dev/null; then if podman ps | grep -q "$AGENT_CONTAINER"; then print_status "Stopping CI Analysis Agent container..." 
@@ -351,10 +393,10 @@ stop_containers() { echo "=================================================================" echo "" echo "📊 Container Status:" - podman ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -E "$OLLAMA_CONTAINER|$AGENT_CONTAINER" || echo " No containers found" + podman ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -E "$OLLAMA_CONTAINER|$AGENT_CONTAINER|$LOKI_MCP_CONTAINER" || echo " No containers found" echo "" echo "🎯 Quick Commands:" - echo " • Start containers: podman start $OLLAMA_CONTAINER $AGENT_CONTAINER" + echo " • Start containers: podman start $OLLAMA_CONTAINER $AGENT_CONTAINER $LOKI_MCP_CONTAINER" echo " • Clean up all: $0 --clean-all" echo " • Remove volumes: $0 --remove-volumes" echo " • Remove images: $0 --remove-images" @@ -373,10 +415,15 @@ clean_all() { # Stop containers first print_status "Stopping containers..." - podman stop "$OLLAMA_CONTAINER" "$AGENT_CONTAINER" 2>/dev/null || true + podman stop "$LOKI_MCP_CONTAINER" "$AGENT_CONTAINER" "$OLLAMA_CONTAINER" 2>/dev/null || true # Remove containers print_status "Removing containers..." 
+ if podman container exists "$LOKI_MCP_CONTAINER" 2>/dev/null; then + podman rm -f "$LOKI_MCP_CONTAINER" 2>/dev/null || true + print_success "Removed Loki MCP container" + fi + if podman container exists "$AGENT_CONTAINER" 2>/dev/null; then podman rm -f "$AGENT_CONTAINER" 2>/dev/null || true print_success "Removed CI Analysis Agent container" @@ -437,6 +484,12 @@ clean_all() { print_success "Removed image: ollama/ollama:latest" fi + # Remove Loki MCP image + if podman image exists "loki-mcp:latest" 2>/dev/null; then + podman rmi -f "loki-mcp:latest" 2>/dev/null || true + print_success "Removed image: loki-mcp:latest" + fi + # Remove any other related images for image in $(podman images --format "{{.Repository}}:{{.Tag}}" 2>/dev/null | grep -E "ci-analysis|ollama" || true); do if [ -n "$image" ] && [ "$image" != "ollama/ollama:latest" ] && [ "$image" != "ci-analysis-agent:latest" ]; then @@ -454,13 +507,13 @@ clean_all() { echo "📊 Remaining Resources:" echo "" echo "Containers:" - podman ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -E "$OLLAMA_CONTAINER|$AGENT_CONTAINER|ci-analysis" || echo " No related containers found" + podman ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -E "$OLLAMA_CONTAINER|$AGENT_CONTAINER|$LOKI_MCP_CONTAINER|ci-analysis" || echo " No related containers found" echo "" echo "Volumes:" podman volume ls --format "table {{.Name}}\t{{.Driver}}" | grep -E "ollama|ci-analysis" || echo " No related volumes found" echo "" echo "Images:" - podman images --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}" | grep -E "ollama|ci-analysis" || echo " No related images found" + podman images --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}" | grep -E "ollama|ci-analysis|loki-mcp" || echo " No related images found" echo "" echo "🎯 Next Steps:" echo " • Fresh deployment: $0" @@ -715,7 +768,9 @@ main() { else print_status "Skipping Ollama setup (using remote vLLM)" fi - + + build_loki_mcp + start_loki_mcp 
start_agent verify_deployment show_status "$gpu_type" diff --git a/requirements.txt b/requirements.txt index 58c916e..f014b1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ litellm>=1.74.0 drain3>=0.9.0 google-cloud-storage>=2.10.0 python-dotenv>=1.0.0 -httpx>=0.24.0 \ No newline at end of file +httpx>=0.24.0 +pytest \ No newline at end of file diff --git a/sub_agents/arch_mismatch_detector/__init__.py b/sub_agents/arch_mismatch_detector/__init__.py new file mode 100644 index 0000000..222581c --- /dev/null +++ b/sub_agents/arch_mismatch_detector/__init__.py @@ -0,0 +1,3 @@ +from .agent import arch_mismatch_detector_agent + +__all__ = ["arch_mismatch_detector_agent"] \ No newline at end of file diff --git a/sub_agents/arch_mismatch_detector/agent.py b/sub_agents/arch_mismatch_detector/agent.py new file mode 100644 index 0000000..b12dace --- /dev/null +++ b/sub_agents/arch_mismatch_detector/agent.py @@ -0,0 +1,97 @@ +"""Arch Mismatch Detector Agent for analyzing CI logs for architecture mismatch errors.""" + +from google.adk import Agent +from google.adk.models.lite_llm import LiteLlm +from . 
import prompt +from google.adk.tools.mcp_tool.mcp_toolset import MCPToolset, StreamableHTTPConnectionParams +from dotenv import load_dotenv + + +import asyncio +import httpx +import threading +import concurrent.futures +import os +from typing import Dict, Any +from datetime import datetime + +GCS_URL = "https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs" + +MODEL = os.environ.get("MODEL", "qwen3:4b") + +load_dotenv() + +def run_async_in_thread(coro): + """Run async function in a thread to avoid event loop conflicts.""" + + def run_in_thread(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(run_in_thread) + return future.result() + +def get_job_start_and_end_time_tool(job_name: str, build_id: str)-> Dict[str, Any]: + """Get the start and end time of a job.""" + return run_async_in_thread(get_job_start_and_end_time_async(job_name, build_id)) + +async def get_job_start_and_end_time_async(job_name: str, build_id: str)-> Dict[str, Any]: + """Get the start and end time of a job.""" + start_time = None + end_time = None + url_started = f"{GCS_URL}/{job_name}/{build_id}/started.json" + url_finished = f"{GCS_URL}/{job_name}/{build_id}/finished.json" + try: + async with httpx.AsyncClient() as client: + response_started = await client.get(url_started) + response_started.raise_for_status() + data_started = response_started.json() + if not data_started: + return {"error": "No response from Prow API for started.json"} + start_time = data_started["timestamp"] + async with httpx.AsyncClient() as client: + response_finished = await client.get(url_finished) + response_finished.raise_for_status() + data_finished = response_finished.json() + if not data_finished: + return {"error": "No response from Prow API for finished.json"} + end_time = data_finished["timestamp"] + # 
Convert epoch timestamps to RFC 3339 format (timestamps are Unix epoch; use UTC so the trailing "Z" is accurate) + if start_time: + start_time = datetime.utcfromtimestamp(start_time).strftime('%Y-%m-%dT%H:%M:%SZ') + if end_time: + end_time = datetime.utcfromtimestamp(end_time).strftime('%Y-%m-%dT%H:%M:%SZ') + return { + "start_time": start_time, + "end_time": end_time + } + except Exception as e: + return {"error": f"Failed to fetch job start and end time: {str(e)}"} + +arch_mismatch_detector_agent = Agent( + model=LiteLlm(model=MODEL), + # model="gemini-2.0-flash", + name="arch_mismatch_detector_agent", + instruction=prompt.ARCH_MISMATCH_DETECTOR_PROMPT, + output_key="arch_mismatch_detector_output", + tools=[ + get_job_start_and_end_time_tool, + MCPToolset( + connection_params=StreamableHTTPConnectionParams( + url="http://127.0.0.1:8888/stream" + ), + tool_filter=[ + "grafana_loki_query", + ], + + ), + + ], +) + +root_agent = arch_mismatch_detector_agent \ No newline at end of file diff --git a/sub_agents/arch_mismatch_detector/prompt.py b/sub_agents/arch_mismatch_detector/prompt.py new file mode 100644 index 0000000..7bd2dd6 --- /dev/null +++ b/sub_agents/arch_mismatch_detector/prompt.py @@ -0,0 +1,42 @@ +"""Prompts for Arch Mismatch Detector Agent.""" + +ARCH_MISMATCH_DETECTOR_PROMPT = f""" +You are the Arch Mismatch Detector agent. You are a grafana loki expert. + +Objective: +- Retrieve and analyze Grafana Loki logs for a specific CI job invocation to identify architecture mismatch errors, specifically messages matching the case-insensitive pattern "exec format". + +Required user inputs in each request: +- job_name: the name of the job to search for. +- build_id: the id of the build to search for. + +Workflow: +- Use the get_job_start_and_end_time_tool to get the start and end time of the job from the prow job_name and build_id. 
+- Prepare the inputs for the grafana_loki_query tool + - orgId: 1 + - datasource uid: PCEB727DF2F34084E (DPCR Loki) + - url: https://grafana-loki.ci.openshift.org/api/ds/query + - expr: + - Replace %job_name%, %build_id% by the values provided in the user inputs in the following expression: + {{invoker="openshift-internal-ci/%job_name%/%build_id%"}} |~ "(?i)exec format" +- Set the start_time and end_time to the values provided by the get_job_start_and_end_time_tool. +- Use the grafana_loki_query tool to query the logs for the job. +- Analyze the logs for the job. +- Return the analysis. + +Tool invocation contract (grafana_loki_query): +- Parameters you must provide: + - datasource uid: PCEB727DF2F34084E (DPCR Loki) + - orgId: 1 + - url: https://grafana-loki.ci.openshift.org/api/ds/query + - expr: the expression above prepared by the workflow + - start: start_time in ISO 8601 / RFC3339 format + - end: end_time in ISO 8601 / RFC3339 format + +Response style: +- Keep outputs concise and focused on the error pattern. +- When no logs are found, this indicates that there are no arch mismatch errors in the job. +- Report total matches, and surface 3–5 representative lines with timestamps. +- Briefly note any repeated message patterns or clusters. 
+- Provide a convenience link for further inspection in Grafana Explore with orgId=1 and the requested time range, e.g., https://grafana-loki.ci.openshift.org/explore?orgId=1 +""" \ No newline at end of file diff --git a/sub_agents/e2e_test_analyst/agent.py b/sub_agents/e2e_test_analyst/agent.py index ff0bd62..a41c602 100644 --- a/sub_agents/e2e_test_analyst/agent.py +++ b/sub_agents/e2e_test_analyst/agent.py @@ -329,6 +329,7 @@ def get_junit_results_tool(job_name: str, build_id: str, test_name: str): e2e_test_analyst_agent = Agent( model=LiteLlm(model=MODEL), + description="Analyzes e2e test logs and provides detailed analysis.", name="e2e_test_analyst_agent", instruction=prompt.E2E_TEST_SPECIALIST_PROMPT, output_key="e2e_test_analysis_output", diff --git a/sub_agents/installation_analyst/agent.py b/sub_agents/installation_analyst/agent.py index e5ad7bb..346f786 100644 --- a/sub_agents/installation_analyst/agent.py +++ b/sub_agents/installation_analyst/agent.py @@ -317,6 +317,7 @@ def get_install_logs_tool(job_name: str, build_id: str, test_name: str): return run_async_in_thread(get_install_logs_async(job_name, build_id, test_name)) installation_analyst_agent = Agent( + description="Analyzes installation logs and provides detailed analysis.", model=LiteLlm(model=MODEL), name="installation_analyst_agent", instruction=prompt.INSTALLATION_SPECIALIST_PROMPT,