Commit b4efd14

Merge pull request #55 from mayank31398/ontocord
add ROCm devices support
2 parents beaf2f2 + 21045b5; commit b4efd14

31 files changed: 2853 additions, 124 deletions

megatron/data/indexed_dataset.py

Lines changed: 4 additions & 4 deletions
@@ -95,8 +95,8 @@ def write_longs(f, a):
     3: np.int16,
     4: np.int32,
     5: np.int64,
-    6: np.float,
-    7: np.double,
+    6: np.float32,
+    7: np.float64,
     8: np.uint16
 }

@@ -268,8 +268,8 @@ class IndexedDatasetBuilder(object):
         np.int16: 2,
         np.int32: 4,
         np.int64: 8,
-        np.float: 4,
-        np.double: 8
+        np.float32: 4,
+        np.float64: 8
     }

     def __init__(self, out_file, dtype=np.int32):
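The replaced names matter beyond style: np.float was a deprecated alias for Python's built-in float (a 64-bit type, removed in NumPy 1.24), yet the builder's element-size table charged it only 4 bytes, and np.double is simply float64. Switching to the explicit np.float32 / np.float64 keeps the code working on current NumPy and makes each dtype code agree with its byte size. A minimal sketch of that consistency check; codes 3-8 and the float sizes copy the hunks above, while codes 1-2 and the integer element sizes not shown here are assumptions:

import numpy as np

# Dtype codes used by the indexed dataset format (codes 3-8 appear in the
# diff above; codes 1-2 are assumed here for completeness).
dtypes = {
    1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32,
    5: np.int64, 6: np.float32, 7: np.float64, 8: np.uint16,
}

# Per-dtype element sizes in bytes, mirroring IndexedDatasetBuilder.element_sizes
# (entries not visible in the hunk are assumed).
element_sizes = {
    np.uint8: 1, np.int8: 1, np.int16: 2, np.int32: 4,
    np.int64: 8, np.float32: 4, np.float64: 8, np.uint16: 2,
}

# Sanity check: each code's dtype now reports exactly the byte size the builder
# assumes, which the old 64-bit np.float entry (listed as 4 bytes) did not.
for code, dtype in dtypes.items():
    assert np.dtype(dtype).itemsize == element_sizes[dtype], (code, dtype)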

megatron/fused_kernels/__init__.py

Lines changed: 9 additions & 118 deletions
@@ -1,121 +1,12 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import pathlib
-import subprocess
-
-from torch.utils import cpp_extension
-
-# Do not override TORCH_CUDA_ARCH_LIST to allow for pre-compilation in Dockerfile
-# os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+import torch


 def load(args):
-
-    # Check if cuda 11 is installed for compute capability 8.0
-    cc_flag = []
-    _, bare_metal_major, _ = _get_cuda_bare_metal_version(
-        cpp_extension.CUDA_HOME)
-    if int(bare_metal_major) >= 11:
-        cc_flag.append('-gencode')
-        cc_flag.append('arch=compute_80,code=sm_80')
-
-    # Build path
-    srcpath = pathlib.Path(__file__).parent.absolute()
-    buildpath = srcpath / 'build'
-    _create_build_dir(buildpath)
-
-    # Helper function to build the kernels.
-    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
-        return cpp_extension.load(
-            name=name,
-            sources=sources,
-            build_directory=buildpath,
-            extra_cflags=['-O3',],
-            extra_cuda_cflags=['-O3',
-                               '-gencode', 'arch=compute_70,code=sm_70',
-                               '--use_fast_math'] + extra_cuda_flags + cc_flag,
-            verbose=(args.rank == 0)
-        )
-
-    # ==============
-    # Fused softmax.
-    # ==============
-
-    if args.masked_softmax_fusion:
-        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
-                            '-U__CUDA_NO_HALF_CONVERSIONS__',
-                            '--expt-relaxed-constexpr',
-                            '--expt-extended-lambda']
-
-        # Upper triangular softmax.
-        sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
-                 srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
-        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_upper_triang_masked_softmax_cuda",
-            sources, extra_cuda_flags)
-
-        # Masked softmax.
-        sources=[srcpath / 'scaled_masked_softmax.cpp',
-                 srcpath / 'scaled_masked_softmax_cuda.cu']
-        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_masked_softmax_cuda", sources, extra_cuda_flags)
-
-        # Softmax
-        sources=[srcpath / 'scaled_softmax.cpp',
-                 srcpath / 'scaled_softmax_cuda.cu']
-        scaled_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_softmax_cuda", sources, extra_cuda_flags)
-
-    # =================================
-    # Mixed precision fused layer norm.
-    # =================================
-
-    extra_cuda_flags = ['-maxrregcount=50']
-    sources=[srcpath / 'layer_norm_cuda.cpp',
-             srcpath / 'layer_norm_cuda_kernel.cu']
-    fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
-        "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags)
-
-    # =================================
-    # Fused gradient accumulation to weight gradient computation of linear layer
-    # =================================
-
-    if args.gradient_accumulation_fusion:
-        sources=[srcpath / 'fused_weight_gradient_dense.cpp',
-                 srcpath / 'fused_weight_gradient_dense.cu']
-        fused_dense_cuda = _cpp_extention_load_helper(
-            "fused_dense_cuda", sources, [])
-
-
-def _get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
-                                         universal_newlines=True)
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    release = output[release_idx].split(".")
-    bare_metal_major = release[0]
-    bare_metal_minor = release[1][0]
-
-    return raw_output, bare_metal_major, bare_metal_minor
-
-
-def _create_build_dir(buildpath):
-    try:
-        os.mkdir(buildpath)
-    except OSError:
-        if not os.path.isdir(buildpath):
-            print(f"Creation of the build directory {buildpath} failed")
+    if torch.version.hip is None:
+        print("running on CUDA devices")
+        from megatron.fused_kernels.cuda import load as load_kernels
+    else:
+        print("running on ROCm devices")
+        from megatron.fused_kernels.rocm import load as load_kernels
+
+    load_kernels(args)
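megatron/fused_kernels/__init__.py now does nothing but dispatch: torch.version.hip is None on CUDA builds of PyTorch and holds a version string on ROCm builds, so the matching backend loader (megatron.fused_kernels.cuda or megatron.fused_kernels.rocm) is imported and handed the args namespace; the shared _create_build_dir helper moves to megatron.fused_kernels.utils. A minimal usage sketch, assuming Megatron-LM is importable and that args carries the attributes read in this diff (rank, masked_softmax_fusion, gradient_accumulation_fusion); real training scripts pass the full parsed Megatron arguments instead:

from types import SimpleNamespace

from megatron import fused_kernels

# Hypothetical stand-in for the parsed Megatron args.
args = SimpleNamespace(
    rank=0,                              # only rank 0 builds verbosely
    masked_softmax_fusion=True,          # compile the fused softmax kernels
    gradient_accumulation_fusion=False,  # skip the fused dense weight-gradient kernel
)

# Picks megatron.fused_kernels.cuda.load or megatron.fused_kernels.rocm.load
# based on torch.version.hip, then JIT-builds the requested extensions.
fused_kernels.load(args)
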
New file: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+import subprocess
+
+from torch.utils import cpp_extension
+from megatron.fused_kernels.utils import _create_build_dir
+
+# Do not override TORCH_CUDA_ARCH_LIST to allow for pre-compilation in Dockerfile
+# os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+
+
+def load(args):
+    # Check if cuda 11 is installed for compute capability 8.0
+    cc_flag = []
+    _, bare_metal_major, _ = _get_cuda_bare_metal_version(
+        cpp_extension.CUDA_HOME)
+    if int(bare_metal_major) >= 11:
+        cc_flag.append('-gencode')
+        cc_flag.append('arch=compute_80,code=sm_80')
+
+    # Build path
+    srcpath = pathlib.Path(__file__).parent.absolute()
+    buildpath = srcpath / 'build'
+    _create_build_dir(buildpath)
+
+    # Helper function to build the kernels.
+    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
+        return cpp_extension.load(
+            name=name,
+            sources=sources,
+            build_directory=buildpath,
+            extra_cflags=['-O3',],
+            extra_cuda_cflags=['-O3',
+                               '-gencode', 'arch=compute_70,code=sm_70',
+                               '--use_fast_math'] + extra_cuda_flags + cc_flag,
+            verbose=(args.rank == 0)
+        )
+
+    # ==============
+    # Fused softmax.
+    # ==============
+
+    if args.masked_softmax_fusion:
+        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
+                            '-U__CUDA_NO_HALF_CONVERSIONS__',
+                            '--expt-relaxed-constexpr',
+                            '--expt-extended-lambda']
+
+        # Upper triangular softmax.
+        sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
+                 srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
+        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
+            "scaled_upper_triang_masked_softmax_cuda",
+            sources, extra_cuda_flags)
+
+        # Masked softmax.
+        sources=[srcpath / 'scaled_masked_softmax.cpp',
+                 srcpath / 'scaled_masked_softmax_cuda.cu']
+        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
+            "scaled_masked_softmax_cuda", sources, extra_cuda_flags)
+
+        # Softmax
+        sources=[srcpath / 'scaled_softmax.cpp',
+                 srcpath / 'scaled_softmax_cuda.cu']
+        scaled_softmax_cuda = _cpp_extention_load_helper(
+            "scaled_softmax_cuda", sources, extra_cuda_flags)
+
+    # =================================
+    # Mixed precision fused layer norm.
+    # =================================
+
+    extra_cuda_flags = ['-maxrregcount=50']
+    sources=[srcpath / 'layer_norm_cuda.cpp',
+             srcpath / 'layer_norm_cuda_kernel.cu']
+    fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
+        "fused_layer_norm_cuda", sources, extra_cuda_flags)
+
+    # =================================
+    # Fused gradient accumulation to weight gradient computation of linear layer
+    # =================================
+
+    if args.gradient_accumulation_fusion:
+        sources=[srcpath / 'fused_weight_gradient_dense.cpp',
+                 srcpath / 'fused_weight_gradient_dense.cu']
+        fused_dense_cuda = _cpp_extention_load_helper(
+            "fused_dense_cuda", sources, [])
+
+
+def _get_cuda_bare_metal_version(cuda_dir):
+    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
+                                         universal_newlines=True)
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    release = output[release_idx].split(".")
+    bare_metal_major = release[0]
+    bare_metal_minor = release[1][0]
+
+    return raw_output, bare_metal_major, bare_metal_minor
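
The CUDA loader keeps the original gencode logic: _get_cuda_bare_metal_version runs `<CUDA_HOME>/bin/nvcc -V` and takes the token after "release", and load() adds the compute_80/sm_80 flags when the major version is at least 11. A small sketch of that parsing on a canned nvcc banner (the sample text is illustrative, not output captured from this build):

# Illustrative `nvcc -V` banner; the real helper obtains this string via
# subprocess.check_output on <CUDA_HOME>/bin/nvcc.
raw_output = (
    "nvcc: NVIDIA (R) Cuda compiler driver\n"
    "Cuda compilation tools, release 11.3, V11.3.109\n"
)
output = raw_output.split()
release_idx = output.index("release") + 1   # token right after "release"
release = output[release_idx].split(".")    # "11.3," -> ["11", "3,"]
bare_metal_major = release[0]               # "11"
bare_metal_minor = release[1][0]            # "3"

# With a major version of 11 or newer, load() also emits
# arch=compute_80,code=sm_80 alongside the default sm_70 target.
assert int(bare_metal_major) >= 11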
File renamed without changes.
