-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import pathlib
-import subprocess
-
-from torch.utils import cpp_extension
-
-# Do not override TORCH_CUDA_ARCH_LIST to allow for pre-compilation in Dockerfile
-# os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+import torch


def load(args):
-
-    # Check if cuda 11 is installed for compute capability 8.0
-    cc_flag = []
-    _, bare_metal_major, _ = _get_cuda_bare_metal_version(
-        cpp_extension.CUDA_HOME)
-    if int(bare_metal_major) >= 11:
-        cc_flag.append('-gencode')
-        cc_flag.append('arch=compute_80,code=sm_80')
-
-    # Build path
-    srcpath = pathlib.Path(__file__).parent.absolute()
-    buildpath = srcpath / 'build'
-    _create_build_dir(buildpath)
-
-    # Helper function to build the kernels.
-    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
-        return cpp_extension.load(
-            name=name,
-            sources=sources,
-            build_directory=buildpath,
-            extra_cflags=['-O3',],
-            extra_cuda_cflags=['-O3',
-                               '-gencode', 'arch=compute_70,code=sm_70',
-                               '--use_fast_math'] + extra_cuda_flags + cc_flag,
-            verbose=(args.rank == 0)
-        )
-
-    # ==============
-    # Fused softmax.
-    # ==============
-
-    if args.masked_softmax_fusion:
-        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
-                            '-U__CUDA_NO_HALF_CONVERSIONS__',
-                            '--expt-relaxed-constexpr',
-                            '--expt-extended-lambda']
-
-        # Upper triangular softmax.
-        sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
-                 srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
-        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_upper_triang_masked_softmax_cuda",
-            sources, extra_cuda_flags)
-
-        # Masked softmax.
-        sources=[srcpath / 'scaled_masked_softmax.cpp',
-                 srcpath / 'scaled_masked_softmax_cuda.cu']
-        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_masked_softmax_cuda", sources, extra_cuda_flags)
-
-        # Softmax
-        sources=[srcpath / 'scaled_softmax.cpp',
-                 srcpath / 'scaled_softmax_cuda.cu']
-        scaled_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_softmax_cuda", sources, extra_cuda_flags)
-
-    # =================================
-    # Mixed precision fused layer norm.
-    # =================================
-
-    extra_cuda_flags = ['-maxrregcount=50']
-    sources=[srcpath / 'layer_norm_cuda.cpp',
-             srcpath / 'layer_norm_cuda_kernel.cu']
-    fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
-        "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags)
-
-    # =================================
-    # Fused gradient accumulation to weight gradient computation of linear layer
-    # =================================
-
-    if args.gradient_accumulation_fusion:
-        sources=[srcpath / 'fused_weight_gradient_dense.cpp',
-                 srcpath / 'fused_weight_gradient_dense.cu']
-        fused_dense_cuda = _cpp_extention_load_helper(
-            "fused_dense_cuda", sources, [])
-
-
-def _get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
-                                         universal_newlines=True)
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    release = output[release_idx].split(".")
-    bare_metal_major = release[0]
-    bare_metal_minor = release[1][0]
-
-    return raw_output, bare_metal_major, bare_metal_minor
-
-
-def _create_build_dir(buildpath):
-    try:
-        os.mkdir(buildpath)
-    except OSError:
-        if not os.path.isdir(buildpath):
-            print(f"Creation of the build directory {buildpath} failed")
+    if torch.version.hip is None:
+        print("running on CUDA devices")
+        from megatron.fused_kernels.cuda import load as load_kernels
+    else:
+        print("running on ROCm devices")
+        from megatron.fused_kernels.rocm import load as load_kernels
+
+    load_kernels(args)
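
The CUDA-specific build logic deleted above presumably moves into the backend module that the new __init__.py imports (megatron.fused_kernels.cuda), with a ROCm counterpart in megatron.fused_kernels.rocm. Those files are not part of this hunk, so the following is only a minimal sketch of what the CUDA backend's load(args) entry point could look like if it keeps the removed cpp_extension logic largely as-is; the file layout, the _load helper, and the makedirs-based build-dir handling are assumptions, not the repository's actual implementation.

# Hypothetical megatron/fused_kernels/cuda.py -- condensed sketch only, assuming
# the removed JIT-build logic is carried over. Not the actual file from this PR.
import os
import pathlib
import subprocess

from torch.utils import cpp_extension


def load(args):
    # Also target sm_80 when the bare-metal CUDA toolkit is version 11 or newer.
    nvcc_out = subprocess.check_output(
        [os.path.join(cpp_extension.CUDA_HOME, 'bin', 'nvcc'), '-V'],
        universal_newlines=True)
    tokens = nvcc_out.split()
    bare_metal_major = tokens[tokens.index('release') + 1].split('.')[0]
    cc_flag = []
    if int(bare_metal_major) >= 11:
        cc_flag += ['-gencode', 'arch=compute_80,code=sm_80']

    # Build next to the kernel sources, as the removed code did.
    srcpath = pathlib.Path(__file__).parent.absolute()
    buildpath = srcpath / 'build'
    os.makedirs(buildpath, exist_ok=True)

    def _load(name, sources, extra_cuda_flags):
        # JIT-compile one extension; only rank 0 prints the build output.
        return cpp_extension.load(
            name=name,
            sources=sources,
            build_directory=buildpath,
            extra_cflags=['-O3'],
            extra_cuda_cflags=['-O3',
                               '-gencode', 'arch=compute_70,code=sm_70',
                               '--use_fast_math'] + extra_cuda_flags + cc_flag,
            verbose=(args.rank == 0))

    if args.masked_softmax_fusion:
        softmax_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
                         '-U__CUDA_NO_HALF_CONVERSIONS__',
                         '--expt-relaxed-constexpr',
                         '--expt-extended-lambda']
        _load('scaled_masked_softmax_cuda',
              [srcpath / 'scaled_masked_softmax.cpp',
               srcpath / 'scaled_masked_softmax_cuda.cu'],
              softmax_flags)
        # The upper-triangular and plain scaled-softmax kernels, the mixed
        # precision fused layer norm, and the fused weight-gradient extension
        # would follow the same pattern as the code removed above.

A megatron.fused_kernels.rocm module would expose the same load(args) signature, so the dispatcher above can alias either backend to load_kernels and call it uniformly.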
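
Callers keep using the single megatron.fused_kernels.load(args) entry point; the backend choice happens inside it via torch.version.hip, which is None on CUDA builds of PyTorch and a version string on ROCm builds. Below is a minimal, hypothetical call site; in Megatron the args namespace normally comes from the framework's argument parser, and only the attributes the builders read are shown here.

from types import SimpleNamespace

from megatron import fused_kernels

# Hypothetical stand-in for Megatron's parsed arguments; only the fields the
# kernel builders actually read are included.
args = SimpleNamespace(rank=0,
                       masked_softmax_fusion=True,
                       gradient_accumulation_fusion=False)

# Dispatches to the CUDA or ROCm builder based on torch.version.hip and
# JIT-compiles the requested fused kernels.
fused_kernels.load(args)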