
Commit 008899d

Formatting
1 parent 6de923a commit 008899d

File tree

benchmarking/int8/int8_benchmark.py
benchmarking/matmul_benchmark.py
bitsandbytes/optim/optimizer.py
bitsandbytes/triton/matmul_perf_model.py
examples/int8_inference_huggingface.py
tests/test_functional.py

6 files changed: +17 additions, -15 deletions
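Every hunk in this commit is the same mechanical change: PEP 8 spacing applied to binary operators inside f-string replacement fields (plus one quote-nesting swap in optimizer.py), consistent with an auto-formatter pass; the commit message only says "Formatting", so the tool is an inference. Whitespace inside the braces is cosmetic, so none of these edits can change behavior. A minimal self-check with made-up values:

# Hypothetical sanity check (not part of the commit): spacing inside an
# f-string expression does not affect the rendered result.
num, time_1, now = 100, 0.0, 2.0
old_style = f"Speed: {num/(now - time_1)}token/s"
new_style = f"Speed: {num / (now - time_1)}token/s"
assert old_style == new_style == "Speed: 50.0token/s"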

benchmarking/int8/int8_benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -65,4 +65,4 @@
 print("=" * 40)
 print(f"Example:\n{tokenizer.decode(generated_ids[0])}")
 print("=" * 40)
-print(f"Speed: {num/(time.time() - time_1)}token/s")
+print(f"Speed: {num / (time.time() - time_1)}token/s")

benchmarking/matmul_benchmark.py

Lines changed: 10 additions & 8 deletions
@@ -66,7 +66,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         torch.matmul(A, B.t())
     torch.cuda.synchronize()
     print(
-        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s",
+        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s",
     )

     # torch.cuda.synchronize()

@@ -88,22 +88,24 @@ def test_bench_matmul(batch, seq, model, hidden):
     for i in range(iters):
         bnb.matmul_4bit(A, B_nf4.t(), quant_state=state_nf4)
     torch.cuda.synchronize()
-    print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s")

     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
         bnb.matmul_4bit(A, B_nf4_c.t(), quant_state=state_nf4_c)
     torch.cuda.synchronize()
-    print(f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    print(
+        f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
+    )

     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
         bnb.matmul(A, B)
     torch.cuda.synchronize()
     print(
-        f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     torch.cuda.synchronize()

@@ -112,7 +114,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         bnb.matmul(A, B, threshold=6.0)
     torch.cuda.synchronize()
     print(
-        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)

@@ -124,7 +126,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         out32 = F.int8_linear_matmul(CA, CB)
     torch.cuda.synchronize()
     print(
-        f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     # C32A, SA = F.transform(CA, "col32")

@@ -183,7 +185,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         linear8bit(A)
     torch.cuda.synchronize()
     print(
-        f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     linearMixedBit(A)

@@ -193,7 +195,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         linearMixedBit(A)
     torch.cuda.synchronize()
     print(
-        f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     # linear8bit_train(A)
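The lines being reformatted above all follow the script's CUDA timing idiom: synchronize, start the clock, run the kernel in a loop, synchronize again, then read the clock. A stripped-down sketch of that idiom (function name and iteration count are illustrative, not from the file; requires a CUDA device):

import time

import torch

def bench(fn, iters=100):
    # Kernel launches are asynchronous, so drain any pending work before timing...
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(iters):
        fn()
    # ...and wait for the timed kernels to actually finish before reading the clock.
    torch.cuda.synchronize()
    return time.time() - t0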

bitsandbytes/optim/optimizer.py

Lines changed: 2 additions & 2 deletions
@@ -450,7 +450,7 @@ def init_state(self, group, p, gindex, pindex):
         elif config["optim_bits"] == 8:
             dtype = torch.uint8
         else:
-            raise NotImplementedError(f'Amount of optimizer bits not supported: {config["optim_bits"]}')
+            raise NotImplementedError(f"Amount of optimizer bits not supported: {config['optim_bits']}")

         if p.numel() < config["min_8bit_size"]:
             dtype = torch.float32

@@ -677,7 +677,7 @@ def init_state(self, group, p, gindex, pindex):
         elif config["optim_bits"] == 8:
             dtype = torch.uint8
         else:
-            raise NotImplementedError(f'Amount of optimizer bits not supported: {config["optim_bits"]}')
+            raise NotImplementedError(f"Amount of optimizer bits not supported: {config['optim_bits']}")

         if p.numel() < config["min_8bit_size"]:
             dtype = torch.float32
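This is the commit's only non-spacing change: the f-string's outer quotes flip from single to double and the dict key inside flips to single. Before Python 3.12, an f-string expression cannot reuse the outer quote character, so one nesting or the other must be chosen; both render identically. A quick standalone check with a dummy config:

# Both quote nestings produce the same message; the commit only standardizes style.
config = {"optim_bits": 3}
a = f'Amount of optimizer bits not supported: {config["optim_bits"]}'
b = f"Amount of optimizer bits not supported: {config['optim_bits']}"
assert a == b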

bitsandbytes/triton/matmul_perf_model.py

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ def estimate_matmul_time(
        print(
            f"Total time: {total_time_ms}ms, compute time: {compute_ms}ms, "
            f"loading time: {load_ms}ms, store time: {store_ms}ms, "
-            f"Activate CTAs: {active_cta_ratio*100}%"
+            f"Activate CTAs: {active_cta_ratio * 100}%"
        )
    return total_time_ms

examples/int8_inference_huggingface.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 tokenizer = LlamaTokenizer.from_pretrained(model_name)
 input_ids = tokenizer(text, return_tensors="pt").input_ids

-max_memory = f"{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB"
+max_memory = f"{int(torch.cuda.mem_get_info()[0] / 1024**3) - 2}GB"

 n_gpus = torch.cuda.device_count()
 max_memory = {i: max_memory for i in range(n_gpus)}
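For context, the reformatted line budgets memory per GPU: torch.cuda.mem_get_info()[0] returns the device's free memory in bytes, which the expression floors to whole GiB and reduces by 2 GiB of headroom, yielding a string like "22GB" for the per-device max_memory map. Worked through with a hypothetical reading:

# Hypothetical numbers, not from the example: pretend 24 GiB are free on the device.
free_bytes = 24 * 1024**3
max_memory = f"{int(free_bytes / 1024**3) - 2}GB"
assert max_memory == "22GB"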

tests/test_functional.py

Lines changed: 2 additions & 2 deletions
@@ -674,12 +674,12 @@ def test_int8_double_quant(self, dim1, dim2):
         min_error = 1 / 500
         if num_not_close_cols > (min_error * n):
             print(
-                f"Min error exceeded {num_not_close_cols} elements are different. Error: {num_not_close_cols/n:.4f}"
+                f"Min error exceeded {num_not_close_cols} elements are different. Error: {num_not_close_cols / n:.4f}"
             )
             assert False
         if num_not_close_rows > (min_error * n):
             print(
-                f"Min error exceeded {num_not_close_rows} elements are different. Error: {num_not_close_rows/n:.4f}"
+                f"Min error exceeded {num_not_close_rows} elements are different. Error: {num_not_close_rows / n:.4f}"
             )
             assert False
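The surrounding test logic tolerates a small number of mismatches: with min_error = 1 / 500, up to 0.2% of the n compared elements may differ before the test prints the observed error rate and fails. The same check in isolation, with invented counts:

# Sketch of the tolerance check with made-up values: 3000 mismatches out of
# 1024 * 1024 elements is an error rate of ~0.0029, above the 0.002 threshold.
min_error = 1 / 500
n = 1024 * 1024
num_not_close = 3000
assert num_not_close > min_error * n  # so the real test would print and fail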
