name: inductor-perf-b200

on:
  schedule:
    - cron: 0 7 * * 1-6
    - cron: 0 7 * * 0
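  # The first cron drives the daily Mon-Sat runs; the second drives the Sunday
  # max-autotune run (see test-periodically and test-weekly below)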
  # NB: GitHub has an upper limit of 10 inputs here, so until we can sort
  # that out, torchao cudagraphs_low_precision is run as part of cudagraphs
  workflow_dispatch:
    inputs:
      training:
        description: Run training (on by default)?
        required: false
        type: boolean
        default: true
      inference:
        description: Run inference (on by default)?
        required: false
        type: boolean
        default: true
      default:
        description: Run inductor_default?
        required: false
        type: boolean
        default: false
      dynamic:
        description: Run inductor_dynamic_shapes?
        required: false
        type: boolean
        default: false
      cppwrapper:
        description: Run inductor_cpp_wrapper?
        required: false
        type: boolean
        default: false
      cudagraphs:
        description: Run inductor_cudagraphs?
        required: false
        type: boolean
        default: true
      freezing_cudagraphs:
        description: Run inductor_cudagraphs with freezing for inference?
        required: false
        type: boolean
        default: false
      aotinductor:
        description: Run aot_inductor for inference?
        required: false
        type: boolean
        default: false
      maxautotune:
        description: Run inductor_max_autotune?
        required: false
        type: boolean
        default: false
      benchmark_configs:
        description: The list of configs used by the benchmark
        required: false
        type: string
        default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200

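# Scope cancellation by workflow, ref, and trigger type so that scheduled and
# manually dispatched runs do not cancel each other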
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

jobs:
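  # Decide which runner fleet (label prefix) the build job should use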
  get-label-type:
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

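  # Single sm100 build whose test matrix fans the three benchmark suites out
  # to B200 runners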
  build:
    name: cuda12.8-py3.10-gcc9-sm100
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      # Use a bigger runner here because CUDA arch 10.0 is only built for B200
      # or newer GPUs, so the build doesn't benefit much from the existing
      # compiler cache from trunk. Also use a memory-intensive runner because
      # memory is usually the bottleneck
      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '10.0'
      test-matrix: |
        { include: [
          { config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
          { config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
          { config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
        ]}
      selected-test-configs: ${{ inputs.benchmark_configs }}
      build-additional-packages: "vision audio fbgemm torchao"
    secrets: inherit

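  # Daily Mon-Sat run, matching the first cron above; enables every config
  # except max-autotune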
  test-periodically:
    name: cuda12.8-py3.10-gcc9-sm100
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '0 7 * * 1-6'
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
      timeout-minutes: 720
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit

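  # Weekly Sunday run that additionally enables max-autotune, hence the
  # longer 24-hour timeout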
  test-weekly:
    name: cuda12.8-py3.10-gcc9-sm100
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '0 7 * * 0'
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 1440
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit

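  # Manual runs via workflow_dispatch; the dashboard tag reflects the chosen
  # inputs, and the scheduled runs are covered by the two jobs above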
  test:
    name: cuda12.8-py3.10-gcc9-sm100
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event_name == 'workflow_dispatch'
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
      timeout-minutes: 720
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit