name: Test Binaries

on:
  pull_request:
    branches:
      - dev
    paths:
      - '.github/workflows/test-binaries.yml'
      - '.github/workflows/menlo-build.yml'
  workflow_dispatch:
    inputs:
      version:
        description: 'Version to test (e.g., b5509, b5857)'
        required: false
        default: 'latest'

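# One job per published build variant; the matrix pins each variant to a runner
# that can execute the resulting binary (the Vulkan build, for instance, runs on ubuntu-22.04).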
jobs:
  test-binaries:
    runs-on: ${{ matrix.runs-on }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - os: "linux"
            name: "noavx-x64"
            runs-on: "ubuntu-20.04"
            binary-name: "llama-server"
            artifact-name: "llama-linux-noavx-x64"
          - os: "linux"
            name: "avx-x64"
            runs-on: "ubuntu-20.04"
            binary-name: "llama-server"
            artifact-name: "llama-linux-avx-x64"
          - os: "linux"
            name: "avx512-x64"
            runs-on: "ubuntu-20.04"
            binary-name: "llama-server"
            artifact-name: "llama-linux-avx512-x64"
          - os: "linux"
            name: "vulkan-x64"
            runs-on: "ubuntu-22.04"
            binary-name: "llama-server"
            artifact-name: "llama-linux-vulkan-x64"
          - os: "macos"
            name: "x64"
            runs-on: "macos-selfhosted-12"
            binary-name: "llama-server"
            artifact-name: "llama-macos-x64"
          - os: "macos"
            name: "arm64"
            runs-on: "macos-selfhosted-12-arm64"
            binary-name: "llama-server"
            artifact-name: "llama-macos-arm64"
          - os: "win"
            name: "avx2-x64"
            runs-on: "windows-cuda-11-7"
            binary-name: "llama-server.exe"
            artifact-name: "llama-win-avx2-x64"
          - os: "win"
            name: "noavx-x64"
            runs-on: "windows-cuda-11-7"
            binary-name: "llama-server.exe"
            artifact-name: "llama-win-noavx-x64"
          - os: "win"
            name: "avx-x64"
            runs-on: "windows-cuda-12-0"
            binary-name: "llama-server.exe"
            artifact-name: "llama-win-avx-x64"
          - os: "win"
            name: "avx512-x64"
            runs-on: "windows-cuda-12-0"
            binary-name: "llama-server.exe"
            artifact-name: "llama-win-avx512-x64"
          - os: "win"
            name: "vulkan-x64"
            runs-on: "windows-cuda-11-7"
            binary-name: "llama-server.exe"
            artifact-name: "llama-win-vulkan-x64"

    steps:
      - name: Checkout
        uses: actions/checkout@v3

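      # Assumes the matching build workflow (menlo-build.yml) has already published
      # this artifact for the current run.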
      - name: Download latest artifacts
        uses: actions/download-artifact@v4
        with:
          name: ${{ matrix.artifact-name }}
          path: ./artifacts

      - name: Extract artifacts
        run: |
          # Find the tar.gz file and extract it
          find ./artifacts -name "*.tar.gz" -exec tar -xzf {} \;

          # Move the extracted directory to llama/
          find . -maxdepth 1 -type d -name "llama-*" -exec mv {} llama \;

      - name: Make binary executable (Linux/macOS)
        if: runner.os != 'Windows'
        run: |
          chmod +x ./llama/build/bin/${{ matrix.binary-name }}

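      # Fetch the quantized GGUF model used as the payload for the smoke tests below.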
      - name: Download test model
        run: |
          mkdir -p models
          curl -L -o models/Lucy-Q4_0.gguf "https://huggingface.co/Menlo/Lucy-gguf/resolve/main/Lucy-Q4_0.gguf"

      - name: Test server startup (Linux/macOS)
        if: runner.os != 'Windows'
        run: |
          echo "Testing ${{ matrix.binary-name }} startup..."
          timeout 30s ./llama/build/bin/${{ matrix.binary-name }} --model models/Lucy-Q4_0.gguf --port 8080 --host 0.0.0.0 &
          SERVER_PID=$!

          # Give the server time to load the model
          sleep 10

          # Test if the server is responding
          if curl -s http://localhost:8080/health > /dev/null; then
            echo "✅ Server started successfully and is responding"
            kill $SERVER_PID
            exit 0
          else
            echo "❌ Server failed to start or respond"
            kill $SERVER_PID 2>/dev/null || true
            exit 1
          fi

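      # The startup test killed its server instance, so the inference test starts a fresh one.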
      - name: Test inference (Linux/macOS)
        if: runner.os != 'Windows'
        run: |
          echo "Testing inference with ${{ matrix.binary-name }}..."

          # Start server
          ./llama/build/bin/${{ matrix.binary-name }} --model models/Lucy-Q4_0.gguf --port 8080 --host 0.0.0.0 &
          SERVER_PID=$!

          # Wait for the server to load the model
          sleep 15

          # Test inference
          curl -X POST http://localhost:8080/completion \
            -H "Content-Type: application/json" \
            -d '{
              "prompt": "Hello, how are you?",
              "n_predict": 10,
              "temperature": 0.7,
              "stop": ["\n", "User:", "Assistant:"]
            }' > response.json

          # Check if we got a valid response
          if [ -s response.json ] && grep -q "content" response.json; then
            echo "✅ Inference test passed"
            cat response.json
            kill $SERVER_PID
            exit 0
          else
            echo "❌ Inference test failed"
            cat response.json
            kill $SERVER_PID 2>/dev/null || true
            exit 1
          fi

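      # Windows variants: Start-Process does not capture the server PID here, so
      # cleanup uses Stop-Process with the image name (llama-server) instead.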
      - name: Test server startup (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          Write-Host "Testing ${{ matrix.binary-name }} startup..."

          # Start server in background
          Start-Process -FilePath ".\llama\build\bin\${{ matrix.binary-name }}" -ArgumentList "--model", "models\Lucy-Q4_0.gguf", "--port", "8080", "--host", "0.0.0.0" -WindowStyle Hidden

          # Wait for the server to load the model
          Start-Sleep -Seconds 10

          # Test if the server is responding
          try {
            $response = Invoke-RestMethod -Uri "http://localhost:8080/health" -Method Get
            Write-Host "✅ Server started successfully and is responding"
            Stop-Process -Name "llama-server" -Force -ErrorAction SilentlyContinue
            exit 0
          } catch {
            Write-Host "❌ Server failed to start or respond"
            Stop-Process -Name "llama-server" -Force -ErrorAction SilentlyContinue
            exit 1
          }

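      # As on Linux/macOS, a fresh server instance is started for the inference check.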
      - name: Test inference (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          Write-Host "Testing inference with ${{ matrix.binary-name }}..."

          # Start server in background
          Start-Process -FilePath ".\llama\build\bin\${{ matrix.binary-name }}" -ArgumentList "--model", "models\Lucy-Q4_0.gguf", "--port", "8080", "--host", "0.0.0.0" -WindowStyle Hidden

          # Wait for the server to load the model
          Start-Sleep -Seconds 15

          # Test inference (use `n so the stop token serializes as a real newline, matching the Linux test)
          $body = @{
            prompt = "Hello, how are you?"
            n_predict = 10
            temperature = 0.7
            stop = @("`n", "User:", "Assistant:")
          } | ConvertTo-Json

          try {
            $response = Invoke-RestMethod -Uri "http://localhost:8080/completion" -Method Post -Body $body -ContentType "application/json"
            Write-Host "✅ Inference test passed"
            $response | ConvertTo-Json -Depth 10
            Stop-Process -Name "llama-server" -Force -ErrorAction SilentlyContinue
            exit 0
          } catch {
            Write-Host "❌ Inference test failed"
            Write-Host $_.Exception.Message
            Stop-Process -Name "llama-server" -Force -ErrorAction SilentlyContinue
            exit 1
          }

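      # response.json comes from the inference steps; if: always() ensures failed runs still upload it.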
      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: test-results-${{ matrix.os }}-${{ matrix.name }}
          path: |
            response.json
            *.log
          retention-days: 1