diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 448987e8d6b..cb4c29ea522 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -176,12 +176,12 @@ jobs: matrix: quant: - name: "non-quantized" - artifact: "voxtral-cuda-export" + artifact: "gemma3-cuda-export" extra_args: "" - # TODO: enable gemma3 quantization - # - name: "quantized-int4-tile-packed" - # artifact: "voxtral-cuda-quantized-int4-tile-packed" - # extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d" + - name: "quantized-int4-tile-packed" + artifact: "gemma3-cuda-quantized-int4-tile-packed" + extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d" + # TODO: enable int4-weight-only on gemma3. # - name: "quantized-int4-weight-only" # artifact: "voxtral-cuda-quantized-int4-weight-only" # # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation. @@ -194,7 +194,7 @@ jobs: gpu-arch-version: 12.6 use-custom-docker-registry: false submodules: recursive - upload-artifact: gemma3-cuda-export + upload-artifact: ${{ matrix.quant.artifact }} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | set -eux @@ -435,9 +435,9 @@ jobs: format: - name: "non-quantized" artifact: "gemma3-cuda-export" - # TODO: enable quantized gemma3. - # - name: "quantized-int4-tile-packed" - # artifact: "gemma3-cuda-quantized-int4-tile-packed" + - name: "quantized-int4-tile-packed" + artifact: "gemma3-cuda-quantized-int4-tile-packed" + # TODO: enable int4-weight-only on gemma3. # - name: "quantized-int4-weight-only" # artifact: "gemma3-cuda-quantized-int4-weight-only" with: