
Commit 008899d

Formatting
1 parent 6de923a commit 008899d

File tree

benchmarking/int8/int8_benchmark.py
benchmarking/matmul_benchmark.py
bitsandbytes/optim/optimizer.py
bitsandbytes/triton/matmul_perf_model.py
examples/int8_inference_huggingface.py
tests/test_functional.py

6 files changed: +17 additions, -15 deletions
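Every hunk in this commit is the same mechanical change: PEP 8 spacing applied to binary operators inside f-string replacement fields (plus one quote-nesting swap in optimizer.py), consistent with an auto-formatter pass; the commit message only says "Formatting", so the tool is an inference. Whitespace inside the braces is cosmetic, so none of these edits can change behavior. A minimal self-check with made-up values:

# Hypothetical sanity check (not part of the commit): spacing inside an
# f-string expression does not affect the rendered result.
num, time_1, now = 100, 0.0, 2.0
old_style = f"Speed: {num/(now - time_1)}token/s"
new_style = f"Speed: {num / (now - time_1)}token/s"
assert old_style == new_style == "Speed: 50.0token/s"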

benchmarking/int8/int8_benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -65,4 +65,4 @@
 print("=" * 40)
 print(f"Example:\n{tokenizer.decode(generated_ids[0])}")
 print("=" * 40)
-print(f"Speed: {num/(time.time() - time_1)}token/s")
+print(f"Speed: {num / (time.time() - time_1)}token/s")

benchmarking/matmul_benchmark.py

Lines changed: 10 additions & 8 deletions
@@ -66,7 +66,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         torch.matmul(A, B.t())
     torch.cuda.synchronize()
     print(
-        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s",
+        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s",
     )

     # torch.cuda.synchronize()

@@ -88,22 +88,24 @@ def test_bench_matmul(batch, seq, model, hidden):
     for i in range(iters):
         bnb.matmul_4bit(A, B_nf4.t(), quant_state=state_nf4)
     torch.cuda.synchronize()
-    print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s")

     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
         bnb.matmul_4bit(A, B_nf4_c.t(), quant_state=state_nf4_c)
     torch.cuda.synchronize()
-    print(f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    print(
+        f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
+    )

     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
         bnb.matmul(A, B)
     torch.cuda.synchronize()
     print(
-        f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     torch.cuda.synchronize()

@@ -112,7 +114,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         bnb.matmul(A, B, threshold=6.0)
     torch.cuda.synchronize()
     print(
-        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)

@@ -124,7 +126,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         out32 = F.int8_linear_matmul(CA, CB)
     torch.cuda.synchronize()
     print(
-        f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     # C32A, SA = F.transform(CA, "col32")

@@ -183,7 +185,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         linear8bit(A)
     torch.cuda.synchronize()
     print(
-        f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     linearMixedBit(A)

@@ -193,7 +195,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         linearMixedBit(A)
     torch.cuda.synchronize()
     print(
-        f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     # linear8bit_train(A)
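The lines being reformatted above all follow the script's CUDA timing idiom: synchronize, start the clock, run the kernel in a loop, synchronize again, then read the clock. A stripped-down sketch of that idiom (function name and iteration count are illustrative, not from the file; requires a CUDA device):

import time

import torch

def bench(fn, iters=100):
    # Kernel launches are asynchronous, so drain any pending work before timing...
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(iters):
        fn()
    # ...and wait for the timed kernels to actually finish before reading the clock.
    torch.cuda.synchronize()
    return time.time() - t0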

bitsandbytes/optim/optimizer.py

Lines changed: 2 additions & 2 deletions
@@ -450,7 +450,7 @@ def init_state(self, group, p, gindex, pindex):
         elif config["optim_bits"] == 8:
             dtype = torch.uint8
         else:
-            raise NotImplementedError(f'Amount of optimizer bits not supported: {config["optim_bits"]}')
+            raise NotImplementedError(f"Amount of optimizer bits not supported: {config['optim_bits']}")

         if p.numel() < config["min_8bit_size"]:
             dtype = torch.float32

@@ -677,7 +677,7 @@ def init_state(self, group, p, gindex, pindex):
         elif config["optim_bits"] == 8:
             dtype = torch.uint8
         else:
-            raise NotImplementedError(f'Amount of optimizer bits not supported: {config["optim_bits"]}')
+            raise NotImplementedError(f"Amount of optimizer bits not supported: {config['optim_bits']}")

         if p.numel() < config["min_8bit_size"]:
             dtype = torch.float32
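This is the commit's only non-spacing change: the f-string's outer quotes flip from single to double and the dict key inside flips to single. Before Python 3.12, an f-string expression cannot reuse the outer quote character, so one nesting or the other must be chosen; both render identically. A quick standalone check with a dummy config:

# Both quote nestings produce the same message; the commit only standardizes style.
config = {"optim_bits": 3}
a = f'Amount of optimizer bits not supported: {config["optim_bits"]}'
b = f"Amount of optimizer bits not supported: {config['optim_bits']}"
assert a == b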

bitsandbytes/triton/matmul_perf_model.py

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ def estimate_matmul_time(
        print(
            f"Total time: {total_time_ms}ms, compute time: {compute_ms}ms, "
            f"loading time: {load_ms}ms, store time: {store_ms}ms, "
-            f"Activate CTAs: {active_cta_ratio*100}%"
+            f"Activate CTAs: {active_cta_ratio * 100}%"
        )
    return total_time_ms

examples/int8_inference_huggingface.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 tokenizer = LlamaTokenizer.from_pretrained(model_name)
 input_ids = tokenizer(text, return_tensors="pt").input_ids

-max_memory = f"{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB"
+max_memory = f"{int(torch.cuda.mem_get_info()[0] / 1024**3) - 2}GB"

 n_gpus = torch.cuda.device_count()
 max_memory = {i: max_memory for i in range(n_gpus)}
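For context, the reformatted line budgets memory per GPU: torch.cuda.mem_get_info()[0] returns the device's free memory in bytes, which the expression floors to whole GiB and reduces by 2 GiB of headroom, yielding a string like "22GB" for the per-device max_memory map. Worked through with a hypothetical reading:

# Hypothetical numbers, not from the example: pretend 24 GiB are free on the device.
free_bytes = 24 * 1024**3
max_memory = f"{int(free_bytes / 1024**3) - 2}GB"
assert max_memory == "22GB"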

tests/test_functional.py

Lines changed: 2 additions & 2 deletions
@@ -674,12 +674,12 @@ def test_int8_double_quant(self, dim1, dim2):
         min_error = 1 / 500
         if num_not_close_cols > (min_error * n):
             print(
-                f"Min error exceeded {num_not_close_cols} elements are different. Error: {num_not_close_cols/n:.4f}"
+                f"Min error exceeded {num_not_close_cols} elements are different. Error: {num_not_close_cols / n:.4f}"
             )
             assert False
         if num_not_close_rows > (min_error * n):
             print(
-                f"Min error exceeded {num_not_close_rows} elements are different. Error: {num_not_close_rows/n:.4f}"
+                f"Min error exceeded {num_not_close_rows} elements are different. Error: {num_not_close_rows / n:.4f}"
             )
             assert False
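The surrounding test logic tolerates a small number of mismatches: with min_error = 1 / 500, up to 0.2% of the n compared elements may differ before the test prints the observed error rate and fails. The same check in isolation, with invented counts:

# Sketch of the tolerance check with made-up values: 3000 mismatches out of
# 1024 * 1024 elements is an error rate of ~0.0029, above the 0.002 threshold.
min_error = 1 / 500
n = 1024 * 1024
num_not_close = 3000
assert num_not_close > min_error * n  # so the real test would print and fail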
