@@ -151,6 +151,34 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
         assert relerr < 0.012
         assert A2.dtype == dtype
 
+    @pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
+    @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No accelerator device")
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
+    @pytest.mark.parametrize("blocksize", [256], ids=id_formatter("blocksize"))
+    def test_dynamic_blockwise_quantization_large(self, device, dtype, blocksize):
+        """
+        Test that we can successfully quantize a large tensor. Note that the following limitations apply:
+        - On CUDA/XPU/ROCm, the number of elements is limited to 2**31 - 1 due to int32 indexing in the C++ kernels.
+        - On CPU, quantization has significantly higher memory overhead, so this test is skipped there.
+        - Verifying dequantization accuracy would take too much memory, so only dtype and element count are checked.
+        """
+        if device not in ["cuda", "xpu"]:
+            pytest.skip("This test is only for CUDA and XPU devices due to memory constraints.")
+
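+        # 2**31 - 1 is the largest element count the int32-indexed kernels can address (see docstring).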
+        data = torch.randn(2**31 - 1, device=device, dtype=dtype)
+        q_data, q_stats = F.quantize_blockwise(data, blocksize=blocksize)
+
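+        # Blockwise 8-bit quantization emits one uint8 code per input element.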
+        assert q_data is not None
+        assert q_data.dtype == torch.uint8
+        assert q_data.numel() == data.numel()
+
+        # Free the input before dequantizing to stay within device memory.
+        del data
+        dq = F.dequantize_blockwise(q_data, q_stats)
+
+        assert dq.dtype == dtype
+        assert dq.numel() == q_data.numel()
+
     @pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
     @pytest.mark.parametrize("hidden", [128])
     @pytest.mark.parametrize("blocksize", [4096, 16384])
@@ -1118,18 +1146,17 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
         A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
         qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
         A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
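+        # The quantized buffers are no longer needed once A2 exists; free them early to keep peak memory down.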
+        del qa, SA
+
+        assert A2.dtype == dtype
 
         err = (A1 - A2).abs().float()
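+        # err now holds everything needed from A2; free it before computing the relative error.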
+        del A2
+
         relerr = (err / (A1.abs().float() + 1e-8)).mean()
         err = err.mean()
 
-        assert A2.dtype == dtype
-
-        # With larger block sizes, we can expect this to blow up.
-        # At blocksize>=1024, don't even bother looking at relerr.
-        #
-        # Actually, the above is not true anymore after fixing the integer packing bug.
-        # The following values were taken from averaging 1k samples per test configuration after fixing the bug.
+        # The following values were taken from averaging 1k samples per test configuration.
         error_dict = dict()
         error_dict["fp4"] = dict()
         error_dict["nf4"] = dict()
@@ -1213,6 +1240,37 @@ def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
         assert err.item() < 0.11
         assert relerr.item() < 0.28
 
+    @pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
+    @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No accelerator device")
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
+    @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
+    @pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128], ids=id_formatter("blocksize"))
+    def test_4bit_quant_large(self, device, dtype, quant_type, blocksize):
+        """
+        Test that we can successfully quantize a large tensor. Note that the following limitations apply:
+        - On CUDA/XPU/ROCm, the number of elements is limited to 2**31 - 1 due to int32 indexing in the C++ kernels.
+        - On CUDA, this test requires ~10 GiB of memory for fp32.
+        - On CPU, quantization has significantly higher memory overhead, so this test is skipped there.
+        - Verifying dequantization accuracy would take too much memory, so only dtype and element count are checked.
+        """
+
+        if device not in ["cuda", "xpu"]:
+            pytest.skip("This test is only for CUDA and XPU devices due to memory constraints.")
+
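+        # 2**31 - 1 fp32 elements alone is ~8 GiB; with the packed 4-bit output and absmax
+        # stats, peak usage approaches the ~10 GiB noted in the docstring.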
+        A1 = torch.randn(2**31 - 1, device=device, dtype=dtype)
+        qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
+
+        assert qa is not None
+        assert qa.dtype == torch.uint8
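+        # With an odd element count the final byte is only half-filled, hence the round-up below.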
+        assert qa.numel() == (2**31 - 1 + 1) // 2  # each byte holds 2 quantized values
+
+        # Free the input before dequantizing to stay within device memory.
+        del A1
+        dq = F.dequantize_4bit(qa, SA)
+
+        assert dq.dtype == dtype
+        assert dq.numel() == 2**31 - 1
+
     # @pytest.mark.parametrize("quant_type", ['fp4', 'nf4'])
     @pytest.mark.parametrize("quant_type", ["nf4"])
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")