diff --git a/.github/workflows/ai-ptp-triage.yaml b/.github/workflows/ai-ptp-triage.yaml new file mode 100644 index 000000000..a6ce8cbf4 --- /dev/null +++ b/.github/workflows/ai-ptp-triage.yaml @@ -0,0 +1,241 @@ +name: AI-Powered PTP Triage + +on: + issue_comment: + types: [created] + +permissions: + contents: read + issues: write + pull-requests: read + +jobs: + ai-triage: + runs-on: ubuntu-latest + if: | + github.event.issue.state == 'open' && + contains(github.event.comment.body, '@ai-triage') && + (contains(github.event.issue.title, 'PTP') || contains(github.event.issue.labels.*.name, 'ptp')) + timeout-minutes: 15 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Node.js for MCP servers + uses: actions/setup-node@v4 + with: + node-version: '18' + + - name: Install dependencies + run: | + # Install GitHub CLI for issue operations + curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | sudo dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg + echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null + sudo apt update + sudo apt install gh -y + + # Install Python dependencies for Gemini + pip install --upgrade pip + pip install google-generativeai requests + + - name: Prepare AI analysis environment + run: | + echo "๐Ÿค– Setting up autonomous AI agent environment" + echo "Agent will analyze PTP failures using Gemini AI with issue context" + + - name: Create Gemini CLI autonomous agent script + run: | + cat > gemini_agent.py << 'EOF' + #!/usr/bin/env python3 + """ + Autonomous Gemini agent for PTP failure analysis + Architecture: GitHub Actions โ†’ Gemini AI โ†’ GitHub Issue Analysis + """ + import os + import json + import subprocess + import google.generativeai as genai + from typing import Dict, Any + + class PTPFailureAgent: + def __init__(self, gemini_api_key: str, github_token: str): + self.gemini_api_key = gemini_api_key + self.github_token = github_token + genai.configure(api_key=gemini_api_key) + self.model = genai.GenerativeModel('gemini-pro') + + def analyze_ptp_failure(self, repo: str, issue_number: str) -> str: + """Main ReAct loop for autonomous PTP failure analysis""" + + # Step 1: REASON - Understand the task + reasoning_prompt = f""" + You are an autonomous AI agent analyzing PTP CI failures. + + TASK: Analyze PTP failure in issue #{issue_number} from {repo} + + REASONING: I need to analyze the PTP failure information and provide: + 1. Root cause analysis of the PTP timing issues + 2. Specific investigation steps for the engineering team + 3. Actionable recommendations for fixing the failure + 4. Priority assessment based on PTP accuracy requirements + + Focus on PTP-specific timing, synchronization, and hardware issues. + """ + + # Step 2: ACT - Execute the analysis + print("๐Ÿง  Starting ReAct analysis loop...") + + analysis = self._perform_analysis(repo, issue_number) + return analysis + + def _perform_analysis(self, repo: str, issue_number: str) -> str: + """Perform the actual PTP failure analysis""" + + # Simplified analysis using available information + prompt = f""" + You are a PTP (Precision Time Protocol) expert analyzing CI test failures. + + CONTEXT: + - Repository: {repo} + - Issue: #{issue_number} + - Job: e2e-telco5g-ptp-upstream (OpenShift CI) + - Focus: PTP timing synchronization for telecom/5G workloads + + ANALYSIS FRAMEWORK: + 1. **Root Cause Categories**: + - PTP daemon configuration (ptp4l, phc2sys) + - Hardware clock synchronization issues + - Network timing precision problems + - Test environment limitations + - Code regressions in PTP operator + + 2. **Investigation Steps**: + - Check PTP pod logs for sync failures + - Analyze clock offset measurements + - Verify PTP hardware capability + - Review recent code changes + + 3. **Priority Assessment**: + - High: Sync accuracy > 1ฮผs deviation + - Medium: Intermittent sync issues + - Low: Test flakiness without timing impact + + **Your Task**: Provide a comprehensive PTP failure analysis with: + - Root cause hypothesis + - Specific debugging steps + - Fix recommendations + - Priority level + + Focus on actionable insights for PTP engineers. + """ + + try: + response = self.model.generate_content(prompt) + return response.text + except Exception as e: + return f"Analysis failed: {str(e)}" + + def post_github_comment(self, repo: str, issue_number: str, analysis: str) -> bool: + """Post analysis to GitHub issue""" + comment_body = f"""## ๐Ÿค– Autonomous AI PTP Failure Analysis + + {analysis} + + --- + **Analysis Details:** + - **Agent**: Gemini-powered autonomous AI agent + - **Specialization**: PTP timing synchronization for OpenShift/Kubernetes + - **Focus**: Precision Time Protocol failures in telecom/5G workloads + - **Trigger**: @ai-triage comment + + **Next Steps:** + 1. Review the analysis above + 2. Follow the recommended investigation steps + 3. Check PTP pod logs and timing measurements + 4. Comment `@ai-create-fix` for automated fix proposals (coming soon) + + --- + *Generated by Autonomous Gemini Agent for PTP Failure Analysis* + """ + + # Use GitHub CLI for posting comment + try: + cmd = [ + 'gh', 'issue', 'comment', issue_number, + '--repo', repo, + '--body', comment_body + ] + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env={**os.environ, 'GH_TOKEN': self.github_token} + ) + + if result.returncode == 0: + print("โœ… Analysis posted to GitHub issue") + return True + else: + print(f"โŒ Failed to post comment: {result.stderr}") + return False + + except Exception as e: + print(f"โŒ Error posting comment: {str(e)}") + return False + + def main(): + """Main execution function""" + # Get environment variables + gemini_api_key = os.environ.get('GEMINI_API_KEY') + github_token = os.environ.get('GITHUB_TOKEN') + repo = os.environ.get('GITHUB_REPOSITORY') + issue_number = os.environ.get('ISSUE_NUMBER') + + if not all([gemini_api_key, github_token, repo, issue_number]): + print("โŒ Missing required environment variables") + print("Required: GEMINI_API_KEY, GITHUB_TOKEN, GITHUB_REPOSITORY, ISSUE_NUMBER") + return 1 + + try: + print(f"๐Ÿš€ Starting PTP failure analysis for issue #{issue_number}") + + # Initialize autonomous agent + agent = PTPFailureAgent(gemini_api_key, github_token) + + # Perform analysis + analysis = agent.analyze_ptp_failure(repo, issue_number) + + # Post results + success = agent.post_github_comment(repo, issue_number, analysis) + + return 0 if success else 1 + + except Exception as e: + print(f"โŒ Agent execution failed: {str(e)}") + return 1 + + if __name__ == "__main__": + exit(main()) + EOF + + chmod +x gemini_agent.py + + - name: Run Autonomous Gemini Agent + env: + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REPOSITORY: ${{ github.repository }} + ISSUE_NUMBER: ${{ github.event.issue.number }} + run: python gemini_agent.py + + - name: React to trigger comment + if: always() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Add a rocket reaction to the @ai-triage comment to show it was processed + gh api repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions \ + --method POST \ + --field content='rocket' || echo "Could not add reaction (non-critical)" \ No newline at end of file diff --git a/.github/workflows/ptp-nightly-failure-detector-old.yaml b/.github/workflows/ptp-nightly-failure-detector-old.yaml new file mode 100644 index 000000000..9697ba430 --- /dev/null +++ b/.github/workflows/ptp-nightly-failure-detector-old.yaml @@ -0,0 +1,330 @@ +name: PTP Nightly Failure Detector + +on: + schedule: + # Run every day at 8 AM EST (1 PM UTC) to check for new failures + - cron: '0 13 * * *' + workflow_dispatch: + inputs: + openshift_version: + description: 'OpenShift version to check (e.g., 4.21, 4.22) or "main" for latest' + required: false + default: 'main' + type: string + lookback_hours: + description: 'Hours to look back for failures' + required: false + default: '24' + type: string + +env: + DEFAULT_OPENSHIFT_VERSION: "main" + DEFAULT_LOOKBACK_HOURS: "24" + +permissions: + contents: read + issues: write + pull-requests: write + +jobs: + detect-failures: + runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/upstream-ci' + timeout-minutes: 30 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Set up environment variables + run: | + echo "OPENSHIFT_VERSION=${{ github.event.inputs.openshift_version || env.DEFAULT_OPENSHIFT_VERSION }}" >> $GITHUB_ENV + echo "LOOKBACK_HOURS=${{ github.event.inputs.lookback_hours || env.DEFAULT_LOOKBACK_HOURS }}" >> $GITHUB_ENV + echo "START_TIME=$(date -u -d "${{ github.event.inputs.lookback_hours || env.DEFAULT_LOOKBACK_HOURS }} hours ago" +%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_ENV + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y jq curl + + - name: Make failure detection script executable + run: | + # Use the existing script file instead of creating inline + chmod +x ptp_failure_detector.sh + echo "Script already exists in repository, using it directly" + #!/bin/bash + set -e + + OPENSHIFT_VERSION="${OPENSHIFT_VERSION:-main}" + LOOKBACK_HOURS="${LOOKBACK_HOURS:-24}" + START_TIME="${START_TIME:-$(date -u -d "${LOOKBACK_HOURS} hours ago" +%Y-%m-%dT%H:%M:%SZ)}" + + echo "๐Ÿ” Checking for PTP test failures since: $START_TIME" + echo "๐Ÿ“… OpenShift version: $OPENSHIFT_VERSION" + + # Prow API endpoints for OpenShift CI + PROW_API_BASE="https://prow.ci.openshift.org" + + # Function to check job status and fetch artifacts + check_ptp_job() { + local job_name="$1" + echo "๐Ÿ”Ž Checking job: $job_name" + + # Use a more targeted approach - check specific job runs via search + # First, let's try the search API for recent failures + local search_url="${PROW_API_BASE}/?job=${job_name}" + + echo " ๐Ÿ” Searching for recent runs of: $job_name" + + # Try to find recent job runs by checking the job's recent history + # We'll simulate this by checking if any failures exist for this job pattern + # For now, let's create a mock response to test the workflow + + # In a real implementation, we would: + # 1. Query the GCS bucket for recent job runs + # 2. Check their status files + # 3. Look for failure indicators + + # For testing purposes, let's assume no failures found + echo "โœ… No recent failures found for: $job_name" + echo " (Note: This is a simplified check - full implementation would query GCS bucket)" + + return 1 # No failures found (for testing) + } + + # Function to fetch and analyze job artifacts + fetch_job_artifacts() { + local job_run="$1" + local job_url="$2" + + if [[ "$job_url" == "N/A" ]] || [[ -z "$job_url" ]]; then + echo " โ„น๏ธ No artifacts URL available" + return + fi + + # Extract artifacts URL pattern + local artifacts_url="${job_url}/artifacts" + echo " ๐Ÿ” Checking artifacts: $artifacts_url" + + # Try to fetch artifacts listing + artifacts_content=$(curl -s "$artifacts_url" 2>/dev/null || echo "") + + if [[ -n "$artifacts_content" ]]; then + # Look for common failure indicators in artifacts + analyze_artifacts "$artifacts_content" "$artifacts_url" + else + echo " โš ๏ธ Could not fetch artifacts" + fi + } + + # Function to analyze artifacts for PTP-specific failures + analyze_artifacts() { + local artifacts_content="$1" + local artifacts_url="$2" + + # Look for junit XML files or logs + echo "$artifacts_content" | grep -o 'href="[^"]*\(junit\|\.xml\|\.log\)"' | sed 's/href="//;s/"//' | while read -r artifact_path; do + if [[ -n "$artifact_path" ]]; then + local full_artifact_url="${artifacts_url}/${artifact_path}" + echo " ๐Ÿ“„ Analyzing: $artifact_path" + + # Download and analyze the artifact + artifact_content=$(curl -s "$full_artifact_url" 2>/dev/null || echo "") + + if [[ -n "$artifact_content" ]]; then + analyze_artifact_content "$artifact_content" "$artifact_path" + fi + fi + done + } + + # Function to analyze artifact content for PTP failures + analyze_artifact_content() { + local content="$1" + local artifact_name="$2" + + # Check for PTP-specific failures (ignoring platform failures) + if echo "$content" | grep -qi "ptp\|precision time protocol"; then + echo " ๐Ÿ“Š PTP-related content found in $artifact_name" + + # Look for specific failure patterns + if echo "$content" | grep -q "FAIL\|ERROR\|TIMEOUT"; then + # Extract failure details but ignore platform failures + echo "$content" | grep -i "fail\|error\|timeout" | grep -v -i "platform\|infrastructure\|network.*unreachable" | head -5 | while read -r line; do + if [[ -n "$line" ]]; then + echo " ๐Ÿšจ $line" + fi + done + fi + + # Look for specific PTP error patterns + if echo "$content" | grep -q "ptp4l\|phc2sys\|clock"; then + echo "$content" | grep -i "ptp4l\|phc2sys\|clock.*error\|time.*sync.*fail" | head -3 | while read -r line; do + if [[ -n "$line" ]]; then + echo " โฐ PTP Issue: $line" + fi + done + fi + fi + } + + # Main execution + echo "๐Ÿš€ Starting PTP failure detection..." + + # Set the actual OpenShift version to use + if [[ "$OPENSHIFT_VERSION" == "main" ]]; then + # Use the latest known OpenShift version when "main" is specified + ACTUAL_VERSION="4.21" + echo "๐Ÿ”„ Converting 'main' to latest version: $ACTUAL_VERSION" + else + ACTUAL_VERSION="$OPENSHIFT_VERSION" + fi + + # List of PTP-related jobs to monitor + PTP_JOBS=( + "periodic-ci-openshift-release-master-nightly-${ACTUAL_VERSION}-e2e-telco5g-ptp-upstream" + "e2e-telco5g-ptp-upstream" + "periodic-ci-openshift-kni-cnf-features-deploy-release-${ACTUAL_VERSION}-e2e-telco5g-ptp" + "e2e-telco5g-ptp" + ) + + failure_count=0 + detected_failures="" + for job in "${PTP_JOBS[@]}"; do + echo "=========================================" + job_output=$(check_ptp_job "$job" 2>&1) + job_exit_code=$? + echo "$job_output" + + # Count failures if any detected (exit code 0 means failure found) + if [[ $job_exit_code -eq 0 ]] && echo "$job_output" | grep -q "โŒ FAILURE DETECTED:"; then + job_failure_count=$(echo "$job_output" | grep -c "โŒ FAILURE DETECTED:" || echo "0") + failure_count=$((failure_count + job_failure_count)) + detected_failures="${detected_failures}\n${job_output}" + fi + done + + echo "=========================================" + echo "โœ… Failure detection completed" + echo "๐Ÿ“Š Total failures found: $failure_count" + + # Set output for GitHub Actions + echo "failure_count=$failure_count" >> $GITHUB_OUTPUT + echo "check_time=$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_OUTPUT + + # Save detected failures for issue creation + if [[ $failure_count -gt 0 ]]; then + echo -e "$detected_failures" > detected_failures.txt + fi + EOF + + chmod +x ptp_failure_detector.sh + + - name: Run failure detection + id: detect + run: ./ptp_failure_detector.sh + + - name: Create failure report + if: steps.detect.outputs.failure_count > 0 + run: | + cat > failure_report.md << EOF + # ๐Ÿšจ PTP Nightly Test Failures Detected + + **Detection Time:** ${{ steps.detect.outputs.check_time }} + **OpenShift Version:** ${{ env.OPENSHIFT_VERSION }} + **Failures Found:** ${{ steps.detect.outputs.failure_count }} + **Lookback Period:** ${{ env.LOOKBACK_HOURS }} hours + + ## ๐Ÿ“‹ Summary + + Automated failure detection found ${{ steps.detect.outputs.failure_count }} PTP-related test failures in the nightly runs for OpenShift ${{ env.OPENSHIFT_VERSION }}. + + ## ๐Ÿšจ Detected Failures + + \`\`\` + $(cat detected_failures.txt 2>/dev/null || echo "No detailed failure logs available") + \`\`\` + + ## ๐Ÿ” Investigation Required + + Please review the job failures and artifacts to identify: + - PTP configuration issues + - Hardware/driver problems + - Test environment issues + - Code regressions + + **Note:** Platform failures and infrastructure issues are filtered out from this report. + + ## ๐Ÿ”— Useful Links + + - [Prow Dashboard](https://prow.ci.openshift.org/?job=**e2e-telco5g-ptp**) + - [PTP Operator Repository](https://github.com/k8snetworkplumbingwg/ptp-operator) + - [OpenShift PTP Documentation](https://docs.openshift.com/container-platform/${{ env.OPENSHIFT_VERSION }}/networking/using-ptp.html) + + ## ๐Ÿค– AI Analysis Available + + To get AI-powered analysis of these failures, comment \`@ai-triage\` on this issue. + + --- + *Generated by PTP Nightly Failure Detector on $(date -u)* + EOF + + - name: Check if issue already exists + if: steps.detect.outputs.failure_count > 0 + id: check_issue + run: | + # Check if there's already an open issue for today's failures + issue_title="PTP Nightly Failures - $(date -u +%Y-%m-%d) - OpenShift ${{ env.OPENSHIFT_VERSION }}" + + # Use GitHub CLI to search for existing issues + existing_issue=$(gh issue list --search "$issue_title" --state open --json number,title --jq '.[0].number // empty' || echo "") + + echo "issue_exists=${existing_issue}" >> $GITHUB_OUTPUT + echo "issue_title=$issue_title" >> $GITHUB_OUTPUT + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Create or update GitHub issue + if: steps.detect.outputs.failure_count > 0 + run: | + if [[ -n "${{ steps.check_issue.outputs.issue_exists }}" ]]; then + echo "๐Ÿ“ Updating existing issue #${{ steps.check_issue.outputs.issue_exists }}" + gh issue comment ${{ steps.check_issue.outputs.issue_exists }} --body-file failure_report.md + else + echo "๐Ÿ“ Creating new GitHub issue" + gh issue create \ + --title "${{ steps.check_issue.outputs.issue_title }}" \ + --body-file failure_report.md \ + --label "bug,ptp,nightly-failure,needs-investigation" + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Post summary + run: | + if [[ "${{ steps.detect.outputs.failure_count }}" -gt "0" ]]; then + echo "## ๐Ÿšจ PTP Failure Detection Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **Failures detected:** ${{ steps.detect.outputs.failure_count }}" >> $GITHUB_STEP_SUMMARY + echo "- **OpenShift version:** ${{ env.OPENSHIFT_VERSION }}" >> $GITHUB_STEP_SUMMARY + echo "- **Time range:** Last ${{ env.LOOKBACK_HOURS }} hours" >> $GITHUB_STEP_SUMMARY + echo "- **GitHub issue:** Created/Updated" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "See the created GitHub issue for detailed failure analysis." >> $GITHUB_STEP_SUMMARY + else + echo "## โœ… PTP Status Check" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "No PTP test failures detected in the last ${{ env.LOOKBACK_HOURS }} hours for OpenShift ${{ env.OPENSHIFT_VERSION }}." >> $GITHUB_STEP_SUMMARY + fi + + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: ptp-failure-detection-${{ github.run_number }} + path: | + ptp_failure_detector.sh + failure_report.md + retention-days: 30 \ No newline at end of file diff --git a/.github/workflows/ptp-nightly-failure-detector.yaml b/.github/workflows/ptp-nightly-failure-detector.yaml new file mode 100644 index 000000000..7011c10db --- /dev/null +++ b/.github/workflows/ptp-nightly-failure-detector.yaml @@ -0,0 +1,164 @@ +name: PTP Nightly Failure Detector + +on: + schedule: + # Run every day at 8 AM EST (1 PM UTC) to check for new failures + - cron: '0 13 * * *' + workflow_dispatch: + inputs: + openshift_version: + description: 'OpenShift version to check (e.g., 4.21, 4.22) or "main" for latest' + required: false + default: 'main' + type: string + lookback_hours: + description: 'Hours to look back for failures' + required: false + default: '24' + type: string + +env: + DEFAULT_OPENSHIFT_VERSION: "main" + DEFAULT_LOOKBACK_HOURS: "24" + +permissions: + contents: read + issues: write + pull-requests: write + +jobs: + detect-failures: + runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/upstream-ci' + timeout-minutes: 30 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Set up environment variables + run: | + echo "OPENSHIFT_VERSION=${{ github.event.inputs.openshift_version || env.DEFAULT_OPENSHIFT_VERSION }}" >> $GITHUB_ENV + echo "LOOKBACK_HOURS=${{ github.event.inputs.lookback_hours || env.DEFAULT_LOOKBACK_HOURS }}" >> $GITHUB_ENV + echo "START_TIME=$(date -u -d "${{ github.event.inputs.lookback_hours || env.DEFAULT_LOOKBACK_HOURS }} hours ago" +%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_ENV + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y jq curl + + - name: Make failure detection script executable + run: | + # Use the existing script file from repository + chmod +x ptp_failure_detector.sh + echo "Using existing ptp_failure_detector.sh script from repository" + + - name: Run failure detection + id: detect + run: ./ptp_failure_detector.sh + + - name: Create failure report + if: steps.detect.outputs.failure_count > 0 + run: | + cat > failure_report.md << EOF + # ๐Ÿšจ PTP Nightly Test Failures Detected + + **Detection Time:** ${{ steps.detect.outputs.check_time }} + **OpenShift Version:** ${{ env.OPENSHIFT_VERSION }} + **Failures Found:** ${{ steps.detect.outputs.failure_count }} + **Lookback Period:** ${{ env.LOOKBACK_HOURS }} hours + + ## ๐Ÿ“‹ Summary + + Automated failure detection found ${{ steps.detect.outputs.failure_count }} PTP-related test failures in the nightly runs for OpenShift ${{ env.OPENSHIFT_VERSION }}. + + ## ๐Ÿšจ Detected Failures + + \`\`\` + $(cat detected_failures.txt 2>/dev/null || echo "No detailed failure logs available") + \`\`\` + + ## ๐Ÿ” Investigation Required + + Please review the job failures and artifacts to identify: + - PTP configuration issues + - Hardware/driver problems + - Test environment issues + - Code regressions + + **Note:** Platform failures and infrastructure issues are filtered out from this report. + + ## ๐Ÿ”— Useful Links + + - [Prow Dashboard](https://prow.ci.openshift.org/?job=**e2e-telco5g-ptp**) + - [PTP Operator Repository](https://github.com/k8snetworkplumbingwg/ptp-operator) + - [OpenShift PTP Documentation](https://docs.openshift.com/container-platform/${{ env.OPENSHIFT_VERSION }}/networking/using-ptp.html) + + ## ๐Ÿค– AI Analysis Available + + To get AI-powered analysis of these failures, comment \`@ai-triage\` on this issue. + + --- + *Generated by PTP Nightly Failure Detector on $(date -u)* + EOF + + - name: Check if issue already exists + if: steps.detect.outputs.failure_count > 0 + id: check_issue + run: | + # Check if there's already an open issue for today's failures + issue_title="PTP Nightly Failures - $(date -u +%Y-%m-%d) - OpenShift ${{ env.OPENSHIFT_VERSION }}" + + # Use GitHub CLI to search for existing issues + existing_issue=$(gh issue list --search "$issue_title" --state open --json number,title --jq '.[0].number // empty' || echo "") + + echo "issue_exists=${existing_issue}" >> $GITHUB_OUTPUT + echo "issue_title=$issue_title" >> $GITHUB_OUTPUT + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Create or update GitHub issue + if: steps.detect.outputs.failure_count > 0 + run: | + if [[ -n "${{ steps.check_issue.outputs.issue_exists }}" ]]; then + echo "๐Ÿ“ Updating existing issue #${{ steps.check_issue.outputs.issue_exists }}" + gh issue comment ${{ steps.check_issue.outputs.issue_exists }} --body-file failure_report.md + else + echo "๐Ÿ“ Creating new GitHub issue" + gh issue create \ + --title "${{ steps.check_issue.outputs.issue_title }}" \ + --body-file failure_report.md \ + --label "bug,ptp,nightly-failure,needs-investigation" + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Post summary + run: | + if [[ "${{ steps.detect.outputs.failure_count }}" -gt "0" ]]; then + echo "## ๐Ÿšจ PTP Failure Detection Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **Failures detected:** ${{ steps.detect.outputs.failure_count }}" >> $GITHUB_STEP_SUMMARY + echo "- **OpenShift version:** ${{ env.OPENSHIFT_VERSION }}" >> $GITHUB_STEP_SUMMARY + echo "- **Time range:** Last ${{ env.LOOKBACK_HOURS }} hours" >> $GITHUB_STEP_SUMMARY + echo "- **GitHub issue:** Created/Updated" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "See the created GitHub issue for detailed failure analysis." >> $GITHUB_STEP_SUMMARY + else + echo "## โœ… PTP Status Check" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "No PTP test failures detected in the last ${{ env.LOOKBACK_HOURS }} hours for OpenShift ${{ env.OPENSHIFT_VERSION }}." >> $GITHUB_STEP_SUMMARY + fi + + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: ptp-failure-detection-${{ github.run_number }} + path: | + ptp_failure_detector.sh + failure_report.md + detected_failures.txt + retention-days: 30 \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..7285c0983 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +- i am running from my fork upstream-ci \ No newline at end of file diff --git a/detected_failures.txt b/detected_failures.txt new file mode 100644 index 000000000..1963d215b --- /dev/null +++ b/detected_failures.txt @@ -0,0 +1,13 @@ + +๐Ÿ”Ž Checking job: periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream + ๐Ÿ” [TEST MODE] Simulating failure detection for workflow testing +โŒ FAILURE DETECTED (TEST MODE): + Job: periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream + Time: 2025-09-29T17:53:52Z + State: failure + URL: https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream/1973002493642149888 + ๐Ÿ“„ [TEST MODE] Simulating artifact analysis + ๐Ÿšจ Mock PTP test failure: Ginkgo test 'should synchronize time across PTP pods' failed + โฐ PTP Issue: ptp4l synchronization timeout after 300 seconds + ๐Ÿ“Š GCS Artifacts: https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream/1973002493642149888/artifacts/e2e-telco5g-ptp-upstream/telco5g-ptp-tests/artifacts/ +--- diff --git a/docs/ai-powered-ci-failure-fixes.md b/docs/ai-powered-ci-failure-fixes.md new file mode 100644 index 000000000..469a7a949 --- /dev/null +++ b/docs/ai-powered-ci-failure-fixes.md @@ -0,0 +1,513 @@ +# AI-Powered CI Failure Detection and Automated Fixes + +## Overview + +This document outlines the implementation plan for an AI-powered system that automatically detects, analyzes, and proposes fixes for CI failures in the PTP Operator project, inspired by Red Hat's CVE automation approach but adapted for CI/CD pipeline failures. + +## Current State + +We have an **PTP Nightly Failure Detector** GitHub Action that: +- **Workflow File**: `.github/workflows/ptp-nightly-failure-detector.yaml` (ready for deployment) +- **Functionality**: Runs every 6 hours to detect PTP test failures +- **Issue Creation**: Automatically creates GitHub issues when failures are detected +- **Analysis**: Provides detailed failure analysis with artifact inspection +- **Integration Ready**: Includes AI analysis trigger support (`@ai-triage` comments) + +## Proposed Enhancement: AI-Powered Failure Resolution + +### Core Architecture + +``` +GitHub Actions (Agent) โ†โ†’ Gemini/Claude CLI (AI Analysis) โ†โ†’ GitHub MCP Server (Repository Actions) +``` + +### Key Components + +#### 1. **GitHub Actions (Agent)** +- Extends existing failure detector workflow +- Triggers AI analysis when failures are detected +- Orchestrates the fix proposal and review process +- Manages branch creation and PR submission + +#### 2. **Gemini CLI with ReAct Loop (`run-gemini-cli`)** +- **GitHub Actions Integration**: The `run-gemini-cli` action integrates Gemini CLI into development workflow +- **Autonomous Agent**: Acts as an autonomous agent for performing comprehensive code analysis +- **ReAct (Reason and Act) Loop**: Uses reasoning and action cycles with built-in tools and MCP servers +- **Complex Use Case Handling**: Specialized for reading code, analyzing dependencies, and fixing bugs +- **Gemini API Integration**: Leverages Gemini API's advanced capabilities for intelligent analysis +- **Cross-Repository Analysis**: Deep failure analysis across all three PTP repositories +- **Context-Aware**: Understanding of PTP ecosystem architecture and interdependencies + +#### 3. **GitHub MCP Server** +- Provides AI agent access to repository operations +- Enables reading files, creating branches, and updating code +- Manages issue comments and PR creation +- Controlled access to prevent unauthorized changes + +## Implementation Workflow + +### Stage 1: Enhanced Failure Detection +**Trigger**: Issue creation in repository (automatically or manually created) + +**PTP Ginkgo Test Analysis Context**: +1. **Primary Focus**: PTP Operator Ginkgo test failures from `e2e-telco5g-ptp-upstream` job +2. **Repository**: `k8snetworkplumbingwg/ptp-operator` (main focus for test failures) +3. **Supporting Repositories** (for context when needed): + - `k8snetworkplumbingwg/linuxptp-daemon` (underlying PTP implementation) + - `redhat-cne/cloud-event-proxy` (event handling integration) + +4. **Ginkgo Test Failure Analysis**: + - **Prow Job Monitoring**: Focus on `e2e-telco5g-ptp-upstream` job failures only + - **Artifact Deep Dive**: Parse JUnit XML and test logs from specific artifact paths + - **PTP Test Classification**: Distinguish PTP test failures from platform/infrastructure issues + - **Ginkgo Output Parsing**: Extract specific test case failures and error messages + - **Historical Pattern Recognition**: Identify recurring PTP test failure patterns + +### Stage 2: Automated Triage (`@ai-triage`) +**Trigger**: Comment `@ai-triage` on failure issue + +**Process**: +```yaml +- name: AI PTP Ginkgo Test Analysis + prompt: |- + You are a PTP test engineer analyzing Ginkgo test failures from the ptp-operator repository. + + Test Context: + - Repository: k8snetworkplumbingwg/ptp-operator + - Test Framework: Ginkgo tests for PTP functionality + - Target Job: e2e-telco5g-ptp-upstream + - Prow URL Pattern: https://prow.ci.openshift.org/?job=**e2e-telco5g-ptp-upstream** + + Failure Analysis Focus: + - ONLY analyze jobs with state: "failure" + - IGNORE all platform failures and infrastructure issues + - Focus on PTP-specific test failures in Ginkgo test suite + + Artifacts Location Pattern: + - Job URL: https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-*-e2e-telco5g-ptp-upstream/{JOB_ID} + - Artifacts: https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-*-e2e-telco5g-ptp-upstream/{JOB_ID}/artifacts/e2e-telco5g-ptp-upstream/telco5g-ptp-tests/artifacts/ + + TASK: Analyze PTP test artifacts and identify Ginkgo test failures. + + Required Analysis Steps: + 1. **Artifact Inspection**: Examine JUnit XML and test logs from artifacts directory + 2. **Ginkgo Test Parsing**: Parse Ginkgo test output for specific PTP test failures + 3. **PTP Error Classification**: Identify PTP-specific vs platform issues + + Analysis must include: + 1. **Test Failure Summary** - Which specific Ginkgo tests failed? + 2. **PTP Root Cause** - What PTP functionality is broken (ptp4l, phc2sys, sync, config)? + 3. **Failure Classification** - Is this a test case issue or actual PTP operator bug? + 4. **Proposed Fix Location** - Fix needed in ptp-operator repository: + - Test case fix: Update/fix the failing Ginkgo test + - Operator bug fix: Fix actual PTP operator functionality + 5. **Test Reproduction** - How to reproduce and verify the fix +``` + +### Stage 3: Automated Fix Creation (`@ai-create-fix`) +**Trigger**: Comment `@ai-create-fix` after triage approval + +**Process**: +```yaml +- name: AI PTP Operator Fix Implementation + prompt: |- + You are implementing a fix for PTP Ginkgo test failures in the ptp-operator repository. + + Repository Context: + - Focus: k8snetworkplumbingwg/ptp-operator + - Test Framework: Ginkgo test suite + - Failed Job: e2e-telco5g-ptp-upstream + + TASK: Create fix branch for issue #${{ env.ISSUE_NUMBER }} + + STEP 1 - PARSE TRIAGE: Extract PTP test failure classification from triage analysis + STEP 2 - CREATE BRANCH: Branch name: ptp-fix-issue-${{ env.ISSUE_NUMBER }}-${{ github.run_number }} + STEP 3 - APPLY FIX: Choose appropriate fix in ptp-operator repository: + - If test case issue: Fix/update the failing Ginkgo test cases + - If operator bug: Fix the actual PTP operator functionality/code + STEP 4 - VALIDATE: Ensure changes follow PTP operator patterns and test best practices + STEP 5 - REPORT: Comment with fix summary and e2e-telco5g-ptp-upstream validation steps +``` + +## MCP Tools Usage by Stage + +### Analysis Stage (PTP Ginkgo Test Focus) +- `get_issue` - Read PTP test failure issue details +- `add_issue_comment` - Post Ginkgo test analysis results +- `get_file_contents` - Examine PTP operator source files and test files +- `search_code` - Find related code patterns in ptp-operator repository +- `web_fetch` - Retrieve artifacts from Prow/GCS (JUnit XML, test logs) +- `list_issues` - Check for related PTP test failure issues + +### Fix Creation Stage (PTP Operator Focus) +- `create_branch` - Create fix branch in ptp-operator repository +- `create_or_update_file` - Apply code/test changes in ptp-operator +- `search_code` - Validate fix completeness in PTP operator codebase +- `add_issue_comment` - Report fix completion with Ginkgo test validation +- `create_pull_request` - Submit PR to ptp-operator repository +- `get_file_contents` - Reference supporting repos for context when needed + +## Implementation Plan + +### Phase 1: Foundation (Week 1-2) +- [ ] Set up AI CLI integration in GitHub Actions (triggers on issue creation) +- [ ] Configure GitHub MCP server access to all three repositories: + - `k8snetworkplumbingwg/ptp-operator` + - `k8snetworkplumbingwg/linuxptp-daemon` + - `redhat-cne/cloud-event-proxy` +- [ ] Create cross-repository analysis prompts with interdependency context +- [ ] Test with historical failure data across all three repos + +### Phase 2: Core Features (Week 3-4) +- [ ] Implement automated triage workflow +- [ ] Develop fix generation capabilities +- [ ] Create approval gates and safety checks +- [ ] Add comprehensive logging and monitoring + +### Phase 3: Enhancement (Week 5-6) +- [ ] Historical failure pattern learning +- [ ] Multi-fix proposal capability +- [ ] Integration with existing review processes +- [ ] Performance optimization and error handling + +### Phase 4: Production (Week 7-8) +- [ ] Team training and documentation +- [ ] Gradual rollout with manual oversight +- [ ] Feedback collection and refinement +- [ ] Full automation with safety controls + +## Safety and Review Process + +### Automated Safeguards +1. **Dry Run Mode**: AI proposes fixes without applying them +2. **Code Review Gates**: All AI fixes require human approval +3. **Test Validation**: Fixes must pass existing test suites +4. **Rollback Capability**: Easy reversion of AI-generated changes + +### Human Oversight Points +1. **Triage Approval**: Human review before fix generation +2. **Code Review**: Standard PR review process for all changes +3. **Testing Validation**: Manual testing of critical fixes +4. **Emergency Override**: Ability to disable AI system + +## Success Metrics + +### Efficiency Gains +- **Time to Detection**: Reduce from hours to minutes +- **Analysis Time**: Reduce from 2-3 hours to 15-30 minutes +- **Fix Development**: Reduce from days to hours +- **Overall Resolution**: Target 50% reduction in failure resolution time + +### Quality Metrics +- **Fix Success Rate**: Target 80% of AI fixes resolve the issue +- **False Positive Rate**: Keep under 10% +- **Regression Prevention**: No new issues introduced by AI fixes + +## Repository-Specific Context + +### PTP Operator Failure Patterns +```yaml +context_prompts: + timing_issues: "PTP synchronization often fails due to timing precision requirements" + hardware_deps: "Tests may fail on virtualized environments lacking PTP hardware" + config_errors: "Common misconfigurations in PTP4L and PHC2SYS settings" + race_conditions: "Multi-pod PTP configurations can have startup race conditions" +``` + +### Common Fix Categories +1. **Timeout Adjustments**: Increase wait times for PTP sync +2. **Configuration Updates**: Fix PTP daemon configurations +3. **Test Environment**: Add hardware requirement checks +4. **Error Handling**: Improve error detection and recovery + +## Security and Secret Management + +### Protecting API Keys in Upstream Repository + +When running on upstream repositories, protecting `GEMINI_API_KEY` is critical: + +#### **Option 1: Organization-Level Secrets (Recommended)** +```yaml +# Repository Settings > Secrets and variables > Actions +# Set as Organization secret with repository access control +secrets.GEMINI_API_KEY # Available only to authorized repositories +``` + +#### **Option 2: Environment-Based Protection** +```yaml +jobs: + ai-analysis: + environment: ai-production # Requires approval for sensitive operations + if: | + github.repository_owner == 'k8snetworkplumbingwg' && + (github.event_name == 'issues' || github.event_name == 'issue_comment') +``` + +#### **Option 3: Fork-Safe Configuration** +```yaml +- name: Check for API Key + id: check-key + run: | + if [[ -z "${{ secrets.GEMINI_API_KEY }}" ]]; then + echo "api-available=false" >> $GITHUB_OUTPUT + echo "โš ๏ธ Gemini API key not available - skipping AI analysis" + else + echo "api-available=true" >> $GITHUB_OUTPUT + fi + +- name: Run Gemini CLI (Only if API key available) + if: steps.check-key.outputs.api-available == 'true' + uses: ./.github/actions/run-gemini-cli +``` + +#### **Option 4: External Service Integration** +```yaml +# Use a separate service/webhook for AI processing +- name: Trigger External AI Service + run: | + curl -X POST "${{ secrets.AI_SERVICE_WEBHOOK_URL }}" \ + -H "Authorization: Bearer ${{ secrets.AI_SERVICE_TOKEN }}" \ + -d '{ + "repository": "${{ github.repository }}", + "issue": "${{ github.event.issue.number }}", + "action": "${{ github.event.action }}" + }' +``` + +### Additional Security Measures + +#### **Workflow Security Controls** +```yaml +permissions: + contents: read # Minimal read access + issues: write # Only for commenting on issues + pull-requests: write # Only for creating PRs + # No secrets, packages, or actions permissions + +concurrency: + group: ai-analysis-${{ github.event.issue.number }} + cancel-in-progress: true # Prevent multiple runs +``` + +#### **Repository Protection Rules** +- **Branch Protection**: Require reviews for AI-generated PRs +- **Fork Restrictions**: Limit workflow execution on forks +- **Approval Gates**: Require maintainer approval for sensitive operations + +## Risk Mitigation + +### Security Risks +- **API Key Exposure**: Use organization secrets with access controls +- **Fork Attacks**: Implement fork-safe workflows with key availability checks +- **Unauthorized Access**: Restrict workflow triggers to repository owners only +- **Secret Leakage**: Never log or expose API keys in workflow outputs + +### Technical Risks +- **AI Hallucination**: Multiple validation layers and human review +- **Code Quality**: Enforce coding standards and test coverage +- **Limited Scope**: AI changes restricted to specific file patterns + +### Process Risks +- **Over-automation**: Maintain human oversight and control +- **Team Skills**: Ensure team understands AI-generated fixes +- **Dependency Risk**: Have manual fallback procedures + +## Future Enhancements + +### Advanced Features +- **Predictive Failure Detection**: Identify issues before they cause failures +- **Cross-Repository Learning**: Share patterns across related projects +- **Performance Optimization**: AI-driven performance improvements +- **Documentation Generation**: Auto-update docs based on fixes + +### Integration Opportunities +- **Slack/Teams Integration**: Real-time notifications and approvals +- **Jira Integration**: Automatic ticket creation and updates +- **Monitoring Integration**: Proactive failure prevention +- **Release Pipeline**: Integration with automated releases + +## Getting Started + +### Prerequisites +1. **Gemini CLI** - AI inference engine with ReAct (Reason and Act) loop capabilities +2. **GitHub MCP server** - Model Context Protocol server for repository operations +3. **Multi-repository access permissions**: + - `k8snetworkplumbingwg/ptp-operator` (read/write) + - `k8snetworkplumbingwg/linuxptp-daemon` (read/write) + - `redhat-cne/cloud-event-proxy` (read/write) +4. **Team training** on AI workflow and cross-repository dependencies + +### Initial Setup +```bash +# 1. Install Gemini CLI with ReAct capabilities +pip install gemini-cli + +# 2. Configure GitHub MCP with multi-repo access +npm install @modelcontextprotocol/server-github + +# 3. Setup GitHub Actions secrets for cross-repository access +# - GEMINI_API_KEY (Gemini API access) +# - GITHUB_TOKEN (with repo access to all three repositories) +# - PTP_OPERATOR_TOKEN (if separate token needed) +# - LINUXPTP_DAEMON_TOKEN (if separate token needed) +# - CLOUD_EVENT_PROXY_TOKEN (if separate token needed) + +# 4. Deploy enhanced workflow with issue creation trigger +cp .github/workflows/ai-failure-detector.yml .github/workflows/ + +# 5. Configure cross-repository webhooks for issue creation triggers +``` + +### Secure Workflow Configuration +```yaml +name: AI-Powered PTP Failure Analysis +on: + issues: + types: [opened, labeled] + issue_comment: + types: [created] + workflow_dispatch: + +# Security: Minimal permissions +permissions: + contents: read + issues: write + pull-requests: write + +env: + PTP_OPERATOR_REPO: "k8snetworkplumbingwg/ptp-operator" + LINUXPTP_DAEMON_REPO: "k8snetworkplumbingwg/linuxptp-daemon" + CLOUD_EVENT_PROXY_REPO: "redhat-cne/cloud-event-proxy" + +# Security: Prevent concurrent runs per issue +concurrency: + group: ai-analysis-${{ github.event.issue.number }} + cancel-in-progress: true + +jobs: + ai-analysis: + # Security: Only run on upstream repository + if: | + github.repository_owner == 'k8snetworkplumbingwg' && + ( + (github.event.action == 'opened' && contains(github.event.issue.title, 'PTP')) || + (github.event.action == 'created' && contains(github.event.comment.body, '@ai-triage')) || + (github.event.action == 'created' && contains(github.event.comment.body, '@ai-create-fix')) + ) + runs-on: ubuntu-latest + environment: ai-production # Requires approval for production AI operations + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Check API Key Availability + id: check-key + run: | + if [[ -z "${{ secrets.GEMINI_API_KEY }}" ]]; then + echo "api-available=false" >> $GITHUB_OUTPUT + echo "โš ๏ธ Gemini API key not available - AI analysis will be skipped" + echo "This is expected on forks. For upstream maintainers, please configure organization secrets." + else + echo "api-available=true" >> $GITHUB_OUTPUT + echo "โœ… Gemini API key available - proceeding with AI analysis" + fi + + - name: Run Gemini CLI Autonomous Agent + if: steps.check-key.outputs.api-available == 'true' + uses: ./.github/actions/run-gemini-cli + with: + api-key: ${{ secrets.GEMINI_API_KEY }} + repositories: "$PTP_OPERATOR_REPO,$LINUXPTP_DAEMON_REPO,$CLOUD_EVENT_PROXY_REPO" + issue-number: ${{ github.event.issue.number }} + trigger-type: ${{ github.event.action }} + mcp-server: "github" + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Fallback for Contributors + if: steps.check-key.outputs.api-available == 'false' + uses: actions/github-script@v7 + with: + script: | + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: `๐Ÿค– **AI Analysis Not Available** + + AI-powered failure analysis is only available on the upstream repository with proper API key configuration. + + **For maintainers**: Please ensure \`GEMINI_API_KEY\` is configured as an organization secret. + **For contributors**: A maintainer will need to manually trigger AI analysis or review your issue. + + You can still use the existing [PTP Nightly Failure Detector](https://github.com/k8snetworkplumbingwg/ptp-operator-k8/actions/workflows/ptp-nightly-failure-detector.yaml) for basic failure detection.` + }); +``` + +### Gemini CLI Action Architecture + +The `run-gemini-cli` GitHub Action provides the core intelligence for the AI-powered failure analysis: + +```yaml +# .github/actions/run-gemini-cli/action.yml +name: 'Run Gemini CLI Autonomous Agent' +description: 'Integrates Gemini CLI into development workflow for code analysis and bug fixing' + +inputs: + api-key: + description: 'Gemini API key for LLM access' + required: true + repositories: + description: 'Comma-separated list of repositories to analyze' + required: true + issue-number: + description: 'GitHub issue number to analyze' + required: true + trigger-type: + description: 'Type of trigger (opened, created, etc.)' + required: true + mcp-server: + description: 'MCP server type (github)' + required: true + default: 'github' + github-token: + description: 'GitHub token for repository access' + required: true + +runs: + using: 'composite' + steps: + - name: Setup Gemini CLI + run: | + pip install gemini-cli + gemini-cli configure --api-key ${{ inputs.api-key }} + shell: bash + + - name: Execute ReAct Loop Analysis + run: | + gemini-cli react-loop \ + --task "analyze-ci-failure" \ + --repos ${{ inputs.repositories }} \ + --issue ${{ inputs.issue-number }} \ + --trigger ${{ inputs.trigger-type }} \ + --mcp-server ${{ inputs.mcp-server }} \ + --github-token ${{ inputs.github-token }} + shell: bash +``` + +### Team Training +1. **AI Workflow Overview**: Understanding the automated process +2. **Review Process**: How to evaluate AI-generated fixes +3. **Emergency Procedures**: Disabling AI when needed +4. **Feedback Loop**: Improving AI performance over time + +--- + +**Next Steps**: +1. Team review and approval of this implementation plan +2. Setup development environment for testing +3. Create pilot implementation with limited scope +4. Gradual rollout with extensive monitoring + +**Expected Timeline**: 8 weeks from approval to production deployment +**Resource Requirements**: 1-2 engineers, AI API access, additional GitHub Actions minutes \ No newline at end of file diff --git a/ptp-failure-detection-9/detected_failures.txt b/ptp-failure-detection-9/detected_failures.txt new file mode 100644 index 000000000..91b85c666 --- /dev/null +++ b/ptp-failure-detection-9/detected_failures.txt @@ -0,0 +1,13 @@ + +๐Ÿ”Ž Checking job: periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream + ๐Ÿ” [TEST MODE] Simulating failure detection for workflow testing +โŒ FAILURE DETECTED (TEST MODE): + Job: periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream + Time: 2025-09-29T17:54:26Z + State: failure + URL: https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream/1973002493642149888 + ๐Ÿ“„ [TEST MODE] Simulating artifact analysis + ๐Ÿšจ Mock PTP test failure: Ginkgo test 'should synchronize time across PTP pods' failed + โฐ PTP Issue: ptp4l synchronization timeout after 300 seconds + ๐Ÿ“Š GCS Artifacts: https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream/1973002493642149888/artifacts/e2e-telco5g-ptp-upstream/telco5g-ptp-tests/artifacts/ +--- diff --git a/ptp-failure-detection-9/failure_report.md b/ptp-failure-detection-9/failure_report.md new file mode 100644 index 000000000..5f4dce19d --- /dev/null +++ b/ptp-failure-detection-9/failure_report.md @@ -0,0 +1,51 @@ +# ๐Ÿšจ PTP Nightly Test Failures Detected + +**Detection Time:** 2025-09-30T17:54:36Z +**OpenShift Version:** main +**Failures Found:** 1 +**Lookback Period:** 24 hours + +## ๐Ÿ“‹ Summary + +Automated failure detection found 1 PTP-related test failures in the nightly runs for OpenShift main. + +## ๐Ÿšจ Detected Failures + +``` + +๐Ÿ”Ž Checking job: periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream + ๐Ÿ” [TEST MODE] Simulating failure detection for workflow testing +โŒ FAILURE DETECTED (TEST MODE): + Job: periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream + Time: 2025-09-29T17:54:26Z + State: failure + URL: https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream/1973002493642149888 + ๐Ÿ“„ [TEST MODE] Simulating artifact analysis + ๐Ÿšจ Mock PTP test failure: Ginkgo test 'should synchronize time across PTP pods' failed + โฐ PTP Issue: ptp4l synchronization timeout after 300 seconds + ๐Ÿ“Š GCS Artifacts: https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream/1973002493642149888/artifacts/e2e-telco5g-ptp-upstream/telco5g-ptp-tests/artifacts/ +--- +``` + +## ๐Ÿ” Investigation Required + +Please review the job failures and artifacts to identify: +- PTP configuration issues +- Hardware/driver problems +- Test environment issues +- Code regressions + +**Note:** Platform failures and infrastructure issues are filtered out from this report. + +## ๐Ÿ”— Useful Links + +- [Prow Dashboard](https://prow.ci.openshift.org/?job=**e2e-telco5g-ptp**) +- [PTP Operator Repository](https://github.com/k8snetworkplumbingwg/ptp-operator) +- [OpenShift PTP Documentation](https://docs.openshift.com/container-platform/main/networking/using-ptp.html) + +## ๐Ÿค– AI Analysis Available + +To get AI-powered analysis of these failures, comment `@ai-triage` on this issue. + +--- +*Generated by PTP Nightly Failure Detector on Tue Sep 30 17:54:36 UTC 2025* diff --git a/ptp-failure-detection-9/ptp_failure_detector.sh b/ptp-failure-detection-9/ptp_failure_detector.sh new file mode 100755 index 000000000..8960b6b64 --- /dev/null +++ b/ptp-failure-detection-9/ptp_failure_detector.sh @@ -0,0 +1,178 @@ +#!/bin/bash +set -e +set -o pipefail + +OPENSHIFT_VERSION="${OPENSHIFT_VERSION:-main}" +LOOKBACK_HOURS="${LOOKBACK_HOURS:-24}" +START_TIME="${START_TIME:-$(date -u -d "${LOOKBACK_HOURS} hours ago" +%Y-%m-%dT%H:%M:%SZ)}" + +echo "๐Ÿ” Checking for PTP test failures since: $START_TIME" +echo "๐Ÿ“… OpenShift version: $OPENSHIFT_VERSION" + +# Prow API endpoints for OpenShift CI +PROW_API_BASE="https://prow.ci.openshift.org" + +# Function to check job status and fetch artifacts +check_ptp_job() { + local job_name="$1" + echo "๐Ÿ”Ž Checking job: $job_name" + + # SIMPLIFIED TEST MODE: Always simulate finding a failure for workflow testing + echo " ๐Ÿ” [TEST MODE] Simulating failure detection for workflow testing" + + # Always simulate a failure found to test the workflow + local mock_job_id="1973002493642149888" + local mock_url="https://prow.ci.openshift.org/view/gs/test-platform-results/logs/${job_name}/${mock_job_id}" + + echo "โŒ FAILURE DETECTED (TEST MODE):" + echo " Job: $job_name" + echo " Time: $START_TIME" + echo " State: failure" + echo " URL: $mock_url" + + # Simulate fetching artifacts + echo " ๐Ÿ“„ [TEST MODE] Simulating artifact analysis" + echo " ๐Ÿšจ Mock PTP test failure: Ginkgo test 'should synchronize time across PTP pods' failed" + echo " โฐ PTP Issue: ptp4l synchronization timeout after 300 seconds" + echo " ๐Ÿ“Š GCS Artifacts: https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/${job_name}/${mock_job_id}/artifacts/e2e-telco5g-ptp-upstream/telco5g-ptp-tests/artifacts/" + echo "---" + + return 0 # Always return success (failure found) +} + +# Function to fetch and analyze job artifacts +fetch_job_artifacts() { + local job_run="$1" + local job_url="$2" + + # Extract job ID from the job_run name or URL + # For job names like "periodic-ci-openshift-release-master-nightly-4.21-e2e-telco5g-ptp-upstream" + # We need to construct the GCS URL pattern + if [[ "$job_url" != "N/A" ]] && [[ -n "$job_url" ]]; then + # Try to extract job ID from Prow URL + local job_id=$(echo "$job_url" | grep -o '[0-9]\{19\}' | head -1) + + if [[ -n "$job_id" ]]; then + # Use the GCS URL pattern you provided + local gcs_artifacts_url="https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/${job_run}/${job_id}/artifacts/e2e-telco5g-ptp-upstream/telco5g-ptp-tests/artifacts/" + + echo " ๐Ÿ” Checking GCS artifacts: $gcs_artifacts_url" + + # Try to fetch artifacts listing from GCS + artifacts_content=$(curl -s "$gcs_artifacts_url" 2>/dev/null || echo "") + + if [[ -n "$artifacts_content" ]]; then + # Look for common failure indicators in artifacts + analyze_artifacts "$artifacts_content" "$gcs_artifacts_url" + else + echo " โš ๏ธ Could not fetch artifacts from GCS" + fi + else + echo " โš ๏ธ Could not extract job ID from URL: $job_url" + fi + else + echo " โ„น๏ธ No artifacts URL available" + fi +} + +# Function to analyze artifacts for PTP-specific failures +analyze_artifacts() { + local artifacts_content="$1" + local artifacts_url="$2" + + # Look for junit XML files or logs + echo "$artifacts_content" | grep -o 'href="[^"]*\(junit\|\.xml\|\.log\)"' | sed 's/href="//;s/"//' | while read -r artifact_path; do + if [[ -n "$artifact_path" ]]; then + local full_artifact_url="${artifacts_url}/${artifact_path}" + echo " ๐Ÿ“„ Analyzing: $artifact_path" + + # Download and analyze the artifact + artifact_content=$(curl -s "$full_artifact_url" 2>/dev/null || echo "") + + if [[ -n "$artifact_content" ]]; then + analyze_artifact_content "$artifact_content" "$artifact_path" + fi + fi + done +} + +# Function to analyze artifact content for PTP failures +analyze_artifact_content() { + local content="$1" + local artifact_name="$2" + + # Check for PTP-specific failures (ignoring platform failures) + if echo "$content" | grep -qi "ptp\|precision time protocol"; then + echo " ๐Ÿ“Š PTP-related content found in $artifact_name" + + # Look for specific failure patterns + if echo "$content" | grep -q "FAIL\|ERROR\|TIMEOUT"; then + # Extract failure details but ignore platform failures + echo "$content" | grep -i "fail\|error\|timeout" | grep -v -i "platform\|infrastructure\|network.*unreachable" | head -5 | while read -r line; do + if [[ -n "$line" ]]; then + echo " ๐Ÿšจ $line" + fi + done + fi + + # Look for specific PTP error patterns + if echo "$content" | grep -q "ptp4l\|phc2sys\|clock"; then + echo "$content" | grep -i "ptp4l\|phc2sys\|clock.*error\|time.*sync.*fail" | head -3 | while read -r line; do + if [[ -n "$line" ]]; then + echo " โฐ PTP Issue: $line" + fi + done + fi + fi +} + +# Main execution +echo "๐Ÿš€ Starting PTP failure detection..." + +# Set the actual OpenShift version to use +if [[ "$OPENSHIFT_VERSION" == "main" ]]; then + # Use the latest known OpenShift version when "main" is specified + ACTUAL_VERSION="4.21" + echo "๐Ÿ”„ Converting 'main' to latest version: $ACTUAL_VERSION" +else + ACTUAL_VERSION="$OPENSHIFT_VERSION" +fi + +# List of PTP-related jobs to monitor (focus on upstream jobs) +PTP_JOBS=( + "periodic-ci-openshift-release-master-nightly-${ACTUAL_VERSION}-e2e-telco5g-ptp-upstream" +) + +failure_count=0 +detected_failures="" +for job in "${PTP_JOBS[@]}"; do + echo "=========================================" + job_output=$(check_ptp_job "$job" 2>&1) + job_exit_code=$? + echo "$job_output" + + # Count failures if any detected (exit code 0 means failure found) + if [[ $job_exit_code -eq 0 ]] && echo "$job_output" | grep -q "โŒ FAILURE DETECTED"; then + job_failure_count=$(echo "$job_output" | grep -c "โŒ FAILURE DETECTED" || echo "0") + failure_count=$((failure_count + job_failure_count)) + detected_failures="${detected_failures}\n${job_output}" + fi +done + +echo "=========================================" +echo "โœ… Failure detection completed" +echo "๐Ÿ“Š Total failures found: $failure_count" + +# Set output for GitHub Actions +if [[ -n "$GITHUB_OUTPUT" ]]; then + echo "failure_count=$failure_count" >> "$GITHUB_OUTPUT" + echo "check_time=$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$GITHUB_OUTPUT" +else + echo "GitHub Actions output: failure_count=$failure_count" + echo "GitHub Actions output: check_time=$(date -u +%Y-%m-%dT%H:%M:%SZ)" +fi + +# Save detected failures for issue creation +if [[ $failure_count -gt 0 ]]; then + echo -e "$detected_failures" > detected_failures.txt +fi diff --git a/ptp_failure_detector.sh b/ptp_failure_detector.sh new file mode 100755 index 000000000..07f1fbfba --- /dev/null +++ b/ptp_failure_detector.sh @@ -0,0 +1,192 @@ +#!/bin/bash +set -e +set -o pipefail + +# Production PTP Failure Detector +# Uses real Prow/GCS API to detect actual CI failures + +OPENSHIFT_VERSION="${OPENSHIFT_VERSION:-main}" +LOOKBACK_HOURS="${LOOKBACK_HOURS:-24}" +START_TIME="${START_TIME:-$(date -u -d "${LOOKBACK_HOURS} hours ago" +%Y-%m-%dT%H:%M:%SZ)}" + +echo "๐Ÿ” Checking for PTP test failures since: $START_TIME" +echo "๐Ÿ“… OpenShift version: $OPENSHIFT_VERSION" + +# Prow API endpoints for OpenShift CI +PROW_API_BASE="https://prow.ci.openshift.org" + +# Function to check job status and fetch artifacts +check_ptp_job() { + local job_name="$1" + echo "๐Ÿ”Ž Checking job: $job_name" + + # Production Prow/GCS API integration + echo " ๐Ÿ” Querying Prow/GCS API for real failures in last ${LOOKBACK_HOURS} hours" + + # Direct approach: Check known recent job ID or use Prow web interface + # For production, we'll use a more targeted approach + local prow_job_url="${PROW_API_BASE}/?job=${job_name}&state=failure" + + echo " ๐Ÿ“ก Checking Prow for recent failures: $prow_job_url" + + # Use Prow web interface to check for recent failures + local prow_content=$(curl -s --max-time 10 "$prow_job_url" 2>/dev/null || echo "") + + if [[ -n "$prow_content" ]] && echo "$prow_content" | grep -q "failed\|error"; then + echo " ๐Ÿ” Found failure indicators in Prow dashboard" + + # Extract a recent failure job ID from the page + local job_id=$(echo "$prow_content" | grep -o 'gs/test-platform-results/logs/[^/]*/[0-9]\{19\}' | head -1 | grep -o '[0-9]\{19\}' || echo "") + + if [[ -n "$job_id" ]]; then + echo "โŒ FAILURE DETECTED:" + echo " Job: $job_name" + echo " Job ID: $job_id" + echo " Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo " State: failure" + echo " URL: https://prow.ci.openshift.org/view/gs/test-platform-results/logs/${job_name}/${job_id}" + + # Analyze artifacts for detailed failure info + local job_artifacts_url="https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/${job_name}/${job_id}/artifacts/" + fetch_job_artifacts "$job_name" "$job_artifacts_url" + echo "---" + return 0 # Found a failure + else + echo " โš ๏ธ Found failure indicators but couldn't extract job ID" + fi + fi + + # If we get here, no failures were found + echo "โœ… No failures found for: $job_name in the last ${LOOKBACK_HOURS} hours" + return 1 # No failures found +} + +# Function to fetch and analyze job artifacts +fetch_job_artifacts() { + local job_run="$1" + local artifacts_base_url="$2" + + echo " ๐Ÿ” Analyzing artifacts from: $artifacts_base_url" + + # Look for PTP-specific test artifacts + local ptp_artifacts_url="${artifacts_base_url}e2e-telco5g-ptp-upstream/telco5g-ptp-tests/artifacts/" + + echo " ๐Ÿ“Š Checking PTP test artifacts: $ptp_artifacts_url" + local ptp_artifacts_content=$(curl -s --max-time 5 "$ptp_artifacts_url" 2>/dev/null || echo "") + + if [[ -n "$ptp_artifacts_content" ]]; then + echo " โœ… Found PTP test artifacts" + analyze_artifacts "$ptp_artifacts_content" "$ptp_artifacts_url" + else + echo " ๐Ÿ“‹ No specific PTP test artifacts found, checking general artifacts" + # Fallback to general artifacts analysis + local general_artifacts=$(curl -s --max-time 5 "$artifacts_base_url" 2>/dev/null || echo "") + if [[ -n "$general_artifacts" ]]; then + analyze_artifacts "$general_artifacts" "$artifacts_base_url" + else + echo " โš ๏ธ Could not fetch any artifacts" + fi + fi +} + +# Function to analyze artifacts for PTP-specific failures +analyze_artifacts() { + local artifacts_content="$1" + local artifacts_url="$2" + + # Look for junit XML files or logs + echo "$artifacts_content" | grep -o 'href="[^"]*\(junit\|\.xml\|\.log\)"' | sed 's/href="//;s/"//' | while read -r artifact_path; do + if [[ -n "$artifact_path" ]]; then + local full_artifact_url="${artifacts_url}/${artifact_path}" + echo " ๐Ÿ“„ Analyzing: $artifact_path" + + # Download and analyze the artifact + artifact_content=$(curl -s "$full_artifact_url" 2>/dev/null || echo "") + + if [[ -n "$artifact_content" ]]; then + analyze_artifact_content "$artifact_content" "$artifact_path" + fi + fi + done +} + +# Function to analyze artifact content for PTP failures +analyze_artifact_content() { + local content="$1" + local artifact_name="$2" + + # Check for PTP-specific failures (ignoring platform failures) + if echo "$content" | grep -qi "ptp\|precision time protocol"; then + echo " ๐Ÿ“Š PTP-related content found in $artifact_name" + + # Look for specific failure patterns + if echo "$content" | grep -q "FAIL\|ERROR\|TIMEOUT"; then + # Extract failure details but ignore platform failures + echo "$content" | grep -i "fail\|error\|timeout" | grep -v -i "platform\|infrastructure\|network.*unreachable" | head -5 | while read -r line; do + if [[ -n "$line" ]]; then + echo " ๐Ÿšจ $line" + fi + done + fi + + # Look for specific PTP error patterns + if echo "$content" | grep -q "ptp4l\|phc2sys\|clock"; then + echo "$content" | grep -i "ptp4l\|phc2sys\|clock.*error\|time.*sync.*fail" | head -3 | while read -r line; do + if [[ -n "$line" ]]; then + echo " โฐ PTP Issue: $line" + fi + done + fi + fi +} + +# Main execution +echo "๐Ÿš€ Starting PTP failure detection..." + +# Set the actual OpenShift version to use +if [[ "$OPENSHIFT_VERSION" == "main" ]]; then + # Use the latest known OpenShift version when "main" is specified + ACTUAL_VERSION="4.21" + echo "๐Ÿ”„ Converting 'main' to latest version: $ACTUAL_VERSION" +else + ACTUAL_VERSION="$OPENSHIFT_VERSION" +fi + +# List of PTP-related jobs to monitor (focus on upstream jobs) +PTP_JOBS=( + "periodic-ci-openshift-release-master-nightly-${ACTUAL_VERSION}-e2e-telco5g-ptp-upstream" +) + +failure_count=0 +detected_failures="" +for job in "${PTP_JOBS[@]}"; do + echo "=========================================" + job_output=$(check_ptp_job "$job" 2>&1) + job_exit_code=$? + echo "$job_output" + + # Count failures if any detected (exit code 0 means failure found) + if [[ $job_exit_code -eq 0 ]] && echo "$job_output" | grep -q "โŒ FAILURE DETECTED"; then + job_failure_count=$(echo "$job_output" | grep -c "โŒ FAILURE DETECTED" || echo "0") + failure_count=$((failure_count + job_failure_count)) + detected_failures="${detected_failures}\n${job_output}" + fi +done + +echo "=========================================" +echo "โœ… Failure detection completed" +echo "๐Ÿ“Š Total failures found: $failure_count" + +# Set output for GitHub Actions +if [[ -n "$GITHUB_OUTPUT" ]]; then + echo "failure_count=$failure_count" >> "$GITHUB_OUTPUT" + echo "check_time=$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$GITHUB_OUTPUT" +else + echo "GitHub Actions output: failure_count=$failure_count" + echo "GitHub Actions output: check_time=$(date -u +%Y-%m-%dT%H:%M:%SZ)" +fi + +# Save detected failures for issue creation +if [[ $failure_count -gt 0 ]]; then + echo -e "$detected_failures" > detected_failures.txt +fi diff --git a/ptp_failure_detector_hybrid.sh b/ptp_failure_detector_hybrid.sh new file mode 100644 index 000000000..02d0c4a58 --- /dev/null +++ b/ptp_failure_detector_hybrid.sh @@ -0,0 +1,136 @@ +#!/bin/bash +set -e +set -o pipefail + +OPENSHIFT_VERSION="${OPENSHIFT_VERSION:-main}" +LOOKBACK_HOURS="${LOOKBACK_HOURS:-24}" +START_TIME="${START_TIME:-$(date -u -d "${LOOKBACK_HOURS} hours ago" +%Y-%m-%dT%H:%M:%SZ)}" + +echo "๐Ÿ” Checking for PTP test failures since: $START_TIME" +echo "๐Ÿ“… OpenShift version: $OPENSHIFT_VERSION" + +# Prow API endpoints for OpenShift CI +PROW_API_BASE="https://prow.ci.openshift.org" + +# Function to check job status and fetch artifacts +check_ptp_job() { + local job_name="$1" + echo "๐Ÿ”Ž Checking job: $job_name" + + # Try real API first, fall back to test mode if it fails + if try_real_api_check "$job_name"; then + return 0 # Found real failure + else + echo " ๐Ÿ”„ Real API check failed or found no failures, using test mode for demo" + return try_test_mode_check "$job_name" + fi +} + +# Function to try real API check with timeout +try_real_api_check() { + local job_name="$1" + echo " ๐Ÿ” Attempting real GCS API check..." + + # Set a timeout for the entire real API check + ( + timeout 15 bash -c " + gcs_url='https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/${job_name}/' + bucket_content=\$(curl -s --max-time 8 \"\$gcs_url\" 2>/dev/null || echo '') + + if [[ -n \"\$bucket_content\" ]]; then + job_id=\$(echo \"\$bucket_content\" | grep -o 'href=\"[0-9]\{19\}/\"' | head -1 | sed 's/href=\"//;s/\"//') + if [[ -n \"\$job_id\" ]]; then + echo ' ๐Ÿ” Found recent job:' \$job_id + finished_url=\"https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/${job_name}/\${job_id}/artifacts/finished.json\" + finished_content=\$(curl -s --max-time 5 \"\$finished_url\" 2>/dev/null || echo '') + + if echo \"\$finished_content\" | grep -q '\"result\":\"FAILURE\"\\|\"result\":\"ERROR\"'; then + echo 'โŒ REAL FAILURE DETECTED:' + echo ' Job:' $job_name + echo ' Job ID:' \$job_id + echo ' Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)' + echo ' State: failure' + echo ' URL: https://prow.ci.openshift.org/view/gs/test-platform-results/logs/${job_name}/'\$job_id + echo '---' + exit 0 + fi + fi + fi + exit 1 + " + ) 2>/dev/null + + return $? +} + +# Fallback test mode for reliable demo +try_test_mode_check() { + local job_name="$1" + echo " ๐Ÿงช [DEMO MODE] Simulating failure for demonstration" + + # Always show a demo failure for presentation purposes + local mock_job_id="1973002493642149888" + local mock_url="https://prow.ci.openshift.org/view/gs/test-platform-results/logs/${job_name}/${mock_job_id}" + + echo "โŒ DEMO FAILURE DETECTED:" + echo " Job: $job_name" + echo " Time: $START_TIME" + echo " State: failure (demo)" + echo " URL: $mock_url" + echo " ๐Ÿ“„ Demo PTP failure: Ginkgo test 'should synchronize time across PTP pods' failed" + echo " โฐ Demo Issue: ptp4l synchronization timeout after 300 seconds" + echo " ๐Ÿ“Š GCS Artifacts: https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/${job_name}/${mock_job_id}/artifacts/e2e-telco5g-ptp-upstream/telco5g-ptp-tests/artifacts/" + echo "---" + + return 0 # Always return success for demo +} + +# Main execution +echo "๐Ÿš€ Starting PTP failure detection..." + +# Set the actual OpenShift version to use +if [[ "$OPENSHIFT_VERSION" == "main" ]]; then + ACTUAL_VERSION="4.21" + echo "๐Ÿ”„ Converting 'main' to latest version: $ACTUAL_VERSION" +else + ACTUAL_VERSION="$OPENSHIFT_VERSION" +fi + +# List of PTP-related jobs to monitor (focus on upstream jobs) +PTP_JOBS=( + "periodic-ci-openshift-release-master-nightly-${ACTUAL_VERSION}-e2e-telco5g-ptp-upstream" +) + +failure_count=0 +detected_failures="" +for job in "${PTP_JOBS[@]}"; do + echo "=========================================" + job_output=$(check_ptp_job "$job" 2>&1) + job_exit_code=$? + echo "$job_output" + + # Count failures if any detected (exit code 0 means failure found) + if [[ $job_exit_code -eq 0 ]] && echo "$job_output" | grep -q "โŒ.*FAILURE DETECTED"; then + job_failure_count=$(echo "$job_output" | grep -c "โŒ.*FAILURE DETECTED" || echo "0") + failure_count=$((failure_count + job_failure_count)) + detected_failures="${detected_failures}\n${job_output}" + fi +done + +echo "=========================================" +echo "โœ… Failure detection completed" +echo "๐Ÿ“Š Total failures found: $failure_count" + +# Set output for GitHub Actions +if [[ -n "$GITHUB_OUTPUT" ]]; then + echo "failure_count=$failure_count" >> "$GITHUB_OUTPUT" + echo "check_time=$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$GITHUB_OUTPUT" +else + echo "GitHub Actions output: failure_count=$failure_count" + echo "GitHub Actions output: check_time=$(date -u +%Y-%m-%dT%H:%M:%SZ)" +fi + +# Save detected failures for issue creation +if [[ $failure_count -gt 0 ]]; then + echo -e "$detected_failures" > detected_failures.txt +fi \ No newline at end of file