inductor-perf-b200 #40
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | name: inductor-perf-b200 | |
| # Triggers: two cron schedules plus a manual dispatch with per-mode toggles. | |
| on: | |
| schedule: | |
| # 07:00 on days 1-6 (Monday-Saturday); this exact string is matched by the | |
| # `if:` of the test-periodically job. | |
| - cron: 0 7 * * 1-6 | |
| # 07:00 on day 0 (Sunday); this exact string is matched by the `if:` of the | |
| # test-weekly job. | |
| - cron: 0 7 * * 0 | |
| # NB: GitHub has an upper limit of 10 inputs here, so until we can sort that | |
| # out, let's run torchao cudagraphs_low_precision as part of cudagraphs | |
| workflow_dispatch: | |
| # The boolean inputs below are interpolated into the dashboard-tag of the | |
| # `test` job; benchmark_configs is forwarded to the build job as | |
| # selected-test-configs. | |
| inputs: | |
| training: | |
| description: Run training (on by default)? | |
| required: false | |
| type: boolean | |
| default: true | |
| inference: | |
| description: Run inference (on by default)? | |
| required: false | |
| type: boolean | |
| default: true | |
| default: | |
| description: Run inductor_default? | |
| required: false | |
| type: boolean | |
| default: false | |
| dynamic: | |
| description: Run inductor_dynamic_shapes? | |
| required: false | |
| type: boolean | |
| default: false | |
| cppwrapper: | |
| description: Run inductor_cpp_wrapper? | |
| required: false | |
| type: boolean | |
| default: false | |
| # Also drives the cudagraphs_low_precision segment of the `test` job's | |
| # dashboard-tag (see the NB above). | |
| cudagraphs: | |
| description: Run inductor_cudagraphs? | |
| required: false | |
| type: boolean | |
| default: true | |
| freezing_cudagraphs: | |
| description: Run inductor_cudagraphs with freezing for inference? | |
| required: false | |
| type: boolean | |
| default: false | |
| aotinductor: | |
| description: Run aot_inductor for inference? | |
| required: false | |
| type: boolean | |
| default: false | |
| maxautotune: | |
| description: Run inductor_max_autotune? | |
| required: false | |
| type: boolean | |
| default: false | |
| # Comma-separated config names; the default lists all three configs that | |
| # appear in the build job's test-matrix. | |
| benchmark_configs: | |
| description: The list of configs used the benchmark | |
| required: false | |
| type: string | |
| default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200 | |
| # Concurrency key: workflow name, PR number (or ref name when there is no PR), | |
| # the commit SHA when the ref is a branch (the expression yields `false` | |
| # otherwise), and two booleans telling manual-dispatch and scheduled runs | |
| # apart. A new run with the same key cancels the one still in progress. | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} | |
| cancel-in-progress: true | |
| # Token permissions for the jobs in this workflow. | |
| # NOTE(review): id-token: write is presumably needed so the test jobs can | |
| # assume the AWS role passed as aws-role-to-assume via OIDC - confirm. | |
| permissions: | |
| id-token: write | |
| contents: read | |
| jobs: | |
| # Calls the shared runner-determinator workflow (pinned to main). Its | |
| # `label-type` output is consumed by the build job as runner_prefix. | |
| get-label-type: | |
| name: get-label-type | |
| uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | |
| # Scheduled runs are allowed only in pytorch/pytorch itself; every run | |
| # additionally requires the repository owner to be `pytorch`. Because build | |
| # needs this job, skipping it here skips the rest of the pipeline as well. | |
| if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | |
| with: | |
| triggering_actor: ${{ github.triggering_actor }} | |
| issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | |
| curr_branch: ${{ github.head_ref || github.ref_name }} | |
| curr_ref_type: ${{ github.ref_type }} | |
| # NOTE(review): presumably opts this workflow out of the `lf` runner | |
| # experiment - confirm against _runner-determinator.yml. | |
| opt_out_experiments: lf | |
| # Builds PyTorch for CUDA 12.8 / sm100 through the reusable _linux-build | |
| # workflow. The test jobs read its `docker-image` and `test-matrix` outputs. | |
| build: | |
| name: cuda12.8-py3.10-gcc9-sm100 | |
| uses: ./.github/workflows/_linux-build.yml | |
| needs: get-label-type | |
| with: | |
| runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | |
| # Use a bigger runner here because CUDA_ARCH 10.0 (see cuda-arch-list below) | |
| # is only built for B200-class (sm100) GPUs, so it doesn't benefit much from | |
| # the existing compiler cache from trunk. Also use a memory-intensive runner | |
| # here because memory is usually the bottleneck | |
| runner: linux.12xlarge.memory | |
| build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 | |
| docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks | |
| cuda-arch-list: '10.0' | |
| # The matrix below has one single-shard entry per benchmark suite | |
| # (huggingface, timm, torchbench), all on linux.dgx.b200 runners. | |
| # selected-test-configs receives the benchmark_configs dispatch input, which | |
| # is empty on scheduled runs. | |
| # NOTE(review): presumably an empty selection keeps the whole matrix - | |
| # confirm in _linux-build.yml. | |
| test-matrix: | | |
| { include: [ | |
| { config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, | |
| { config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, | |
| { config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, | |
| ]} | |
| selected-test-configs: ${{ inputs.benchmark_configs }} | |
| build-additional-packages: "vision audio fbgemm torchao" | |
| secrets: inherit | |
| # Scheduled benchmark run for the Monday-Saturday cron. | |
| test-periodically: | |
| name: cuda12.8-py3.10-gcc9-sm100 | |
| uses: ./.github/workflows/_linux-test.yml | |
| needs: build | |
| # Must stay textually identical to the first cron entry under on.schedule. | |
| if: github.event.schedule == '0 7 * * 1-6' | |
| with: | |
| build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 | |
| # Fixed tag: every listed mode is set to true. Unlike test-weekly, this tag | |
| # contains no maxautotune or freeze_autotune_cudagraphs entries. | |
| dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true | |
| docker-image: ${{ needs.build.outputs.docker-image }} | |
| test-matrix: ${{ needs.build.outputs.test-matrix }} | |
| aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | |
| # 720 minutes = 12 hours. | |
| timeout-minutes: 720 | |
| disable-monitor: false | |
| monitor-log-interval: 15 | |
| monitor-data-collect-interval: 4 | |
| secrets: inherit | |
| # Scheduled benchmark run for the Sunday cron. | |
| test-weekly: | |
| name: cuda12.8-py3.10-gcc9-sm100 | |
| uses: ./.github/workflows/_linux-test.yml | |
| needs: build | |
| # Must stay textually identical to the second cron entry under on.schedule. | |
| if: github.event.schedule == '0 7 * * 0' | |
| with: | |
| build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 | |
| # Same modes as test-periodically plus maxautotune-true and | |
| # freeze_autotune_cudagraphs-true. | |
| dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true | |
| docker-image: ${{ needs.build.outputs.docker-image }} | |
| test-matrix: ${{ needs.build.outputs.test-matrix }} | |
| # 1440 minutes = 24 hours, twice the limit of the Monday-Saturday job. | |
| timeout-minutes: 1440 | |
| aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | |
| disable-monitor: false | |
| monitor-log-interval: 15 | |
| monitor-data-collect-interval: 4 | |
| secrets: inherit | |
| # Manually dispatched benchmark run; the modes come from the | |
| # workflow_dispatch inputs. | |
| test: | |
| name: cuda12.8-py3.10-gcc9-sm100 | |
| uses: ./.github/workflows/_linux-test.yml | |
| needs: build | |
| # Only run for manual dispatches. On schedule events the `inputs` context is | |
| # empty, so without this gate the job would run next to test-periodically / | |
| # test-weekly with a tag of the form `training--inference--...` and take up | |
| # the B200 runners a second time. | |
| if: github.event_name == 'workflow_dispatch' | |
| with: | |
| build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 | |
| # Each segment mirrors one dispatch input. cudagraphs_low_precision reuses | |
| # inputs.cudagraphs on purpose, because workflow_dispatch allows at most 10 | |
| # inputs. | |
| dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} | |
| docker-image: ${{ needs.build.outputs.docker-image }} | |
| test-matrix: ${{ needs.build.outputs.test-matrix }} | |
| aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | |
| # 720 minutes = 12 hours. | |
| timeout-minutes: 720 | |
| disable-monitor: false | |
| monitor-log-interval: 15 | |
| monitor-data-collect-interval: 4 | |
| secrets: inherit | |