Commit 2bce175

Fixed Makefile.

1 parent 4bd1151

3 files changed: +27 -113 lines changed

Makefile (2 additions, 22 deletions)

@@ -40,11 +40,6 @@ CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
 CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler
 
 # Later versions of CUDA support the new architectures
-CC_CUDA10x += -gencode arch=compute_75,code=sm_75
-
-CC_CUDA110 := -gencode arch=compute_75,code=sm_75
-CC_CUDA110 += -gencode arch=compute_80,code=sm_80
-
 CC_CUDA11x := -gencode arch=compute_75,code=sm_75
 CC_CUDA11x += -gencode arch=compute_80,code=sm_80
 CC_CUDA11x += -gencode arch=compute_86,code=sm_86

@@ -54,8 +49,8 @@ CC_cublasLt110 := -gencode arch=compute_75,code=sm_75
 CC_cublasLt110 += -gencode arch=compute_80,code=sm_80
 
 CC_cublasLt111 := -gencode arch=compute_75,code=sm_75
-#CC_cublasLt111 += -gencode arch=compute_80,code=sm_80
-#CC_cublasLt111 += -gencode arch=compute_86,code=sm_86
+CC_cublasLt111 += -gencode arch=compute_80,code=sm_80
+CC_cublasLt111 += -gencode arch=compute_86,code=sm_86
 
 CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89
 CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90

@@ -66,16 +61,6 @@ all: $(BUILD_DIR) env
 	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
 
-cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda10x_nomatmul: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE_10x) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
 cuda110_nomatmul: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o

@@ -122,11 +107,6 @@ env:
 	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
 	@echo "============================"
 
-cutlass:
-	if [ ! -d "$(ROOT_DIR)/dependencies/cutlass" ]; then \
-		git clone https://github.com/NVIDIA/cutlass.git $(ROOT_DIR)/dependencies/cutlass; \
-	fi \
-
 $(BUILD_DIR):
 	mkdir -p build
 	mkdir -p dependencies
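
In sum, this commit drops the CUDA 9.2 and 10.x build targets (cuda92, cuda10x_nomatmul) along with the cutlass clone rule, and un-comments the sm_80 and sm_86 -gencode entries for CC_cublasLt111, so cuBLASLt-enabled builds now cover Turing (sm_75) and both Ampere variants. A minimal sketch, not part of this commit and assuming only a CUDA-enabled PyTorch install, for checking whether a local GPU falls inside those targets:

import torch

# Hedged sketch: the -gencode targets CC_cublasLt111 emits after this commit.
# Adjust the set if you build a different Makefile target.
built_targets = {(7, 5), (8, 0), (8, 6)}  # sm_75, sm_80, sm_86

major, minor = torch.cuda.get_device_capability(0)  # needs a CUDA build of torch
if (major, minor) in built_targets:
    print(f"sm_{major}{minor}: covered by this build")
else:
    # A missing sm_XX entry can still run via PTX JIT when compatible PTX is
    # embedded, but an exact -gencode entry avoids JIT overhead at load time.
    print(f"sm_{major}{minor}: consider -gencode arch=compute_{major}{minor},code=sm_{major}{minor}")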

bitsandbytes/functional.py (0 additions, 69 deletions)

@@ -128,11 +128,6 @@ def __init__(self):
 
     def initialize(self):
        self.context = {}
-        # prev_device = torch.cuda.current_device()
-        # for i in range(torch.cuda.device_count()):
-        #     torch.cuda.set_device(torch.device('cuda', i))
-        #     self.context.append(ct.c_void_p(lib.get_context()))
-        # torch.cuda.set_device(prev_device)
 
     @classmethod
     def get_instance(cls):

@@ -238,72 +233,8 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True):
         return values
     else:
         l = values.numel()//2
-        #return torch.Tensor(values[:l].tolist() + [-1e-6]*((gap//2)-1) + [0]*2 + [1e-6]*((gap//2)-1) + values[l:].tolist())
         return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist())
 
-def create_custom_map(seed=0, scale=0.01):
-    v = [12, 10, 8, 6, 3, 2, 1]
-    # 16-bit 7B 22.33, 4-bit best 22.88, FP4 23.25, 4-bit 95 22.97, 4-bit evo 22.45
-    # 16-bit 13B 70.35, 4-bit best 67.16, FP4 100.78, 4-bit-95 69.39, 4-bit evo 70.48
-
-    # 13B 100 steps:
-    # - 4-bit evo: 86.02
-    # - 4-bit norm: 78.73
-    # - 4-bit FP4:
-    # - 16-bit:
-
-    # interval search on normal distribution
-    #v = [3.090232306167813, 1.4589770349449647, 1.064410327932115, 0.7896806653244509, 0.5646884166925807, 0.3653406435875121, 0.17964844284441311] # 0.999 26.5
-    #v = [2.3263478740408408, 1.4050715603096329, 1.0364333894937898, 0.7721932141886848, 0.5533847195556727, 0.3584587932511938, 0.1763741647808615] # 0.99 24.99
-    #v = [1.6448536269514722, 1.2040469600267016, 0.9208229763683788, 0.6971414348463417, 0.5039653672113453, 0.3280721075316511, 0.16184416680396213] # 0.95 24.53 22.97
-    #v = [1.4050715603096329, 1.0803193408149558, 0.8416212335729143, 0.643345405392917, 0.4676987991145084, 0.3054807880993974, 0.1509692154967774] # 0.92 24.81
-    #v = [1.2815515655446004, 1.0062699858608395, 0.7916386077433746, 0.6084981344998837, 0.4438613119262478, 0.29050677112339396, 0.14372923370582416] # 0.9 24.68
-    #v = [1.8807936081512509, 1.2980047163986055, 0.9769954022693226, 0.7341502955472268, 0.5285136765472481, 0.343225833559403, 0.16910470304375366] # 0.97 25.03
-    #v = [1.7506860712521692, 1.2496468758017434, 0.9485350408266378, 0.7155233557034365, 0.5162006366043174, 0.3356393360829622, 0.16547334454641704] # 0.96 24.85 23.01
-    #v = [1.5547735945968535, 1.1608220210715001, 0.893800631179489, 0.6789921163940618, 0.4918050830048072, 0.3205236191093902, 0.15821711945563585] # 0.94 24.47
-    #v = [1.475791028179171, 1.1196635980209986, 0.8674156943957149, 0.6610637542614526, 0.4797170937629045, 0.31299335020578195, 0.15459215234139795] # 0.93 24.85
-    #v = [1.5981931399228175, 1.1821583959486879, 0.9072289939325966, 0.6880384454306778, 0.49787602226482025, 0.3242955535308664, 0.160030379970179] # 0.945 24.287
-    ##v = [1.6164363711150211, 1.1908453913294612, 0.9126463450304729, 0.6916727602238111, 0.5003095327012462, 0.3258056171348078, 0.1607558311941979] # 0.947 24.293
-    #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.207
-    #v = [1.6118251211466303, 1.188665228776879, 0.9112895004060624, 0.690763326564427, 0.4997008778346997, 0.3254280317127771, 0.16057446047146948] # 0.9465 24.30
-    #v = [1.6027040905517569, 1.184321770169049, 0.9085808314549837, 0.6889461706317986, 0.4984841229538408, 0.32467299997597887, 0.1602117348657326] # 0.9455 24.293
-    #v = [1.6072478919002173, 1.1864907014855421, 0.9099343314196248, 0.6898544638558411, 0.4990924080314459, 0.32505049268156666, 0.16039309503073892] # 0.946 24.37 22.88
-
-    # 7B evo start
-    #v = [1.62129629, 1.18870191, 0.90848106, 0.69108646, 0.50515268, 0.34927819905, 0.14122701] # 22.06
-    #v = [1.6143079205628337, 1.1888081407660314, 0.8990131955745421, 0.694373759813679, 0.5083033257326773, 0.3452499746844963, 0.1148939728228951]
-    #v = [1.614442766030303, 1.189401918639665, 0.8998038168964273, 0.6953094818279475, 0.5073264599048384, 0.3449003790823619, 0.11428378427205564]
-
-    # 13B evo start
-    #v = [1.6077535089716468, 1.1914902148179205, 0.8999752421085561, 0.6967904489387543, 0.4949093928311768, 0.30920472033044544, 0.15391602735952042]
-    #v = [1.586363722436466, 1.202610827188916, 0.9003332576346587, 0.6904888715206972, 0.49490974688233724, 0.2971151461329376, 0.15683230810738283]
-    v = [1.5842247437829478, 1.2037228884260156, 0.900369059187269, 0.6898587137788914, 0.4949097822874533, 0.2959061887131868, 0.15712393618216908]
-
-    # mean evo 7B + 13B
-    #v = [1.5993337549066253, 1.1965624035328402, 0.9000864380418481, 0.6925840978034195, 0.5011181210961458, 0.32040328389777434, 0.13570386022711237]
-
-    # theoretically optiomal (0.93333)
-    #v = [1.501085946044025, 1.1331700302595604, 0.8761428492468408, 0.6670160135425023, 0.48373855304610314, 0.3155014472579608, 0.15580024666388428] # 0.9333333333333333
-
-    if seed > 0:
-        v = np.array(v)
-        np.random.seed(seed)
-        v += np.random.randn(7)*scale
-        print(v.tolist())
-        #v[0] += (np.random.randn(1)*0.001)[0]
-        #v[-1] += (np.random.randn(1)*0.001)[0]
-        #print(v[0], v[-1])
-        v = v.tolist()
-    values = v + [0]*(256-14) + \
-        v[::-1]
-
-    values = torch.Tensor(values)
-    values[0:7] *= -1
-    values = values.sort().values
-    values /= values.max()
-    assert values.numel() == 256
-    return values
-
 def create_normal_map(offset=0.9677083, use_extra_value=True):
 
     if use_extra_value:
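
The deleted create_custom_map was an experimental, hand-tuned 4-bit code: the commented-out candidate vectors above came from interval search on a normal distribution and from evolutionary runs on 7B/13B models. What survives is create_linear_map, whose else branch splices zeros into the middle of the code so the map still holds a full 256 entries. A simplified sketch of that padding step, with made-up sizes, not reproducing the library's exact function:

import torch

# Simplified sketch (not bitsandbytes' exact create_linear_map): build an
# evenly spaced signed code with fewer than 2**total_bits entries, then pad
# the middle with zeros, mirroring the surviving return statement above.
def linear_map_sketch(total_bits=8, used_values=254):
    values = torch.linspace(-1.0, 1.0, used_values)
    gap = 2**total_bits - values.numel()  # entries left to fill with zeros
    if gap == 0:
        return values
    l = values.numel() // 2
    return torch.Tensor(values[:l].tolist() + [0] * gap + values[l:].tolist())

assert linear_map_sketch().numel() == 256  # 254 codes + 2 spliced zeros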

tests/test_functional.py (25 additions, 22 deletions)

@@ -1773,21 +1773,24 @@ def test_spmm_coo_dequant(dim1, dim2, dtype):
     print("partial matmul", time.time() - t0)
 
 
-batch_size = 2
-seqdim = 2048
+batch_size = 1
+seqdim = 1
 values = []
-values.append((batch_size, seqdim, 768, 4 * 768))
+#values.append((batch_size, seqdim, 768, 4 * 768))
 #values.append((batch_size, seqdim, 1024, 4*1024))
 #values.append((batch_size, seqdim, 1536, 4*1536))
 #values.append((batch_size, seqdim, 2048, 4*2048))
 #values.append((batch_size, seqdim, 2560, 4*2560))
-#values.append((batch_size, seqdim, 4096, 4*4096))
+values.append((batch_size, seqdim, 4096, 4*4096))
+values.append((batch_size, seqdim, 5120, 4*5120))
+values.append((batch_size, seqdim, 6656, 4*6656))
+values.append((batch_size, seqdim, 8192, 4*8192))
 #values.append((batch_size, seqdim, 5140, 4*5140))
 #values.append((batch_size, seqdim, 12288, 4*12288))
 names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values]
 @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names)
 def test_bench_matmul(batch, seq, model, hidden):
-    iters = 1
+    iters = 80
     formatB = F.get_special_format_str()
 
     A = torch.randn(batch, seq, model, device="cuda").half()

@@ -1799,14 +1802,14 @@ def test_bench_matmul(batch, seq, model, hidden):
 
     B_nf4, state_nf4= F.quantize_nf4(B)
 
-    linear8bit = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half()
+    linear8bit = bnb.nn.Linear8bitLt(model, hidden, False, False).cuda().half()
     linear8bit.eval()
 
     outliers = torch.randint(0, model, size=(5,)).cuda()
     A[:, :, outliers] = 8.0
 
-    linearMixedBit = (bnb.nn.Linear8bitLt(model, hidden, False, threshold=6.0).cuda().half())
-    linearMixedBit.eval()
+    linearMixedBit = (bnb.nn.Linear8bitLt(model, hidden, False, False, threshold=6.0).cuda().half())
+    #linearMixedBit.eval()
 
     linear8bit_train = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half()
     linear8bit_train_thresh = bnb.nn.Linear8bitLt(model, hidden, False, threshold=6.0).cuda().half()

@@ -1898,21 +1901,21 @@ def test_bench_matmul(batch, seq, model, hidden):
     #torch.cuda.synchronize()
     #print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
-    #linear8bit(A)
-    #torch.cuda.synchronize()
-    #t0 = time.time()
-    #for i in range(iters):
-    #    linear8bit(A)
-    #torch.cuda.synchronize()
-    #print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    linear8bit(A)
+    torch.cuda.synchronize()
+    t0 = time.time()
+    for i in range(iters):
+        linear8bit(A)
+    torch.cuda.synchronize()
+    print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
-    #linearMixedBit(A)
-    #torch.cuda.synchronize()
-    #t0 = time.time()
-    #for i in range(iters):
-    #    linearMixedBit(A)
-    #torch.cuda.synchronize()
-    #print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    linearMixedBit(A)
+    torch.cuda.synchronize()
+    t0 = time.time()
+    for i in range(iters):
+        linearMixedBit(A)
+    torch.cuda.synchronize()
+    print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     #linear8bit_train(A)
     #torch.cuda.synchronize()
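
On the test side, the benchmark configurations move to realistic model widths (4096 through 8192, with batch size and sequence length 1), iters rises from 1 to 80, and the Linear8bitLt timing blocks are re-enabled. The extra positional False handed to bnb.nn.Linear8bitLt most plausibly disables fp16 weights (has_fp16_weights); treat that reading as an assumption here, not something the diff documents. The re-enabled blocks follow the standard CUDA timing pattern of warmup, synchronize, timed loop, synchronize; a standalone sketch of that pattern, with hypothetical shapes:

import time

import torch

# Minimal sketch of the timing pattern test_bench_matmul uses above. The
# synchronize() calls matter because CUDA kernel launches are asynchronous:
# without them, time.time() measures launch overhead, not GPU work.
def bench(fn, iters=80):
    fn()                      # warmup: lazy init, caching, autotuning
    torch.cuda.synchronize()  # drain queued kernels before starting the clock
    t0 = time.time()
    for _ in range(iters):
        fn()
    torch.cuda.synchronize()  # wait for all timed kernels to finish
    return time.time() - t0

# Hypothetical usage matching one of the parametrized shapes:
# A = torch.randn(1, 1, 4096, device="cuda").half()
# layer = torch.nn.Linear(4096, 4 * 4096, bias=False).cuda().half()
# print(f"{bench(lambda: layer(A)):.4f}s for 80 iterations")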
