diff --git a/benchmarks/benchmark_aq.py b/benchmarks/benchmark_aq.py
index cdc6f6fe5a..aae2e5bad6 100644
--- a/benchmarks/benchmark_aq.py
+++ b/benchmarks/benchmark_aq.py
@@ -20,6 +20,7 @@
     Int4WeightOnlyQuantizedLinearWeight,
     Int8WeightOnlyQuantizedLinearWeight,
 )
+from torchao.testing.model_architectures import ToySingleLinearModel
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_4,
     TORCH_VERSION_AT_LEAST_2_5,
@@ -62,32 +63,6 @@ def _int4wo_api(mod, **kwargs):
         change_linear_weights_to_int4_woqtensors(mod, **kwargs)


-class ToyLinearModel(torch.nn.Module):
-    """Single linear for m * k * n problem size"""
-
-    def __init__(
-        self, m=64, n=32, k=64, has_bias=False, dtype=torch.float, device="cuda"
-    ):
-        super().__init__()
-        self.m = m
-        self.dtype = dtype
-        self.device = device
-        self.linear = torch.nn.Linear(k, n, bias=has_bias).to(
-            dtype=self.dtype, device=self.device
-        )
-
-    def example_inputs(self):
-        return (
-            torch.randn(
-                self.m, self.linear.in_features, dtype=self.dtype, device=self.device
-            ),
-        )
-
-    def forward(self, x):
-        x = self.linear(x)
-        return x
-
-
 def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs):
     """
     The deprecated implementation for int8 dynamic quant API, used as a reference for
@@ -151,7 +126,7 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
     if kwargs is None:
         kwargs = {}

-    m = ToyLinearModel(
+    m = ToySingleLinearModel(
         M, N, K, has_bias=True, dtype=torch.bfloat16, device="cuda"
     ).eval()
     m_bf16 = copy.deepcopy(m)
diff --git a/docs/source/quick_start.rst b/docs/source/quick_start.rst
index 02b59c2430..f957ca478f 100644
--- a/docs/source/quick_start.rst
+++ b/docs/source/quick_start.rst
@@ -29,19 +29,9 @@ First, let's set up our toy model:

     import copy
     import torch
+    from torchao.testing.model_architectures import ToyMultiLinearModel

-    class ToyLinearModel(torch.nn.Module):
-        def __init__(self, m: int, n: int, k: int):
-            super().__init__()
-            self.linear1 = torch.nn.Linear(m, n, bias=False)
-            self.linear2 = torch.nn.Linear(n, k, bias=False)
-
-        def forward(self, x):
-            x = self.linear1(x)
-            x = self.linear2(x)
-            return x
-
-    model = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")
+    model = ToyMultiLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")

     # Optional: compile model for faster inference and generation
     model = torch.compile(model, mode="max-autotune", fullgraph=True)
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index 5e0c42f901..5f3b2f9cd1 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -7,7 +7,7 @@ Serialization and deserialization flow
 ======================================

 Here is the serialization and deserialization flow::
-    
+
     import copy
     import tempfile
     import torch
@@ -16,23 +16,10 @@ Here is the serialization and deserialization flow::
         quantize_,
         Int4WeightOnlyConfig,
     )
-
-    class ToyLinearModel(torch.nn.Module):
-        def __init__(self, m=64, n=32, k=64):
-            super().__init__()
-            self.linear1 = torch.nn.Linear(m, n, bias=False)
-            self.linear2 = torch.nn.Linear(n, k, bias=False)
-
-        def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
-            return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)
-
-        def forward(self, x):
-            x = self.linear1(x)
-            x = self.linear2(x)
-            return x
+    from torchao.testing.model_architectures import ToyMultiLinearModel

     dtype = torch.bfloat16
-    m = ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to("cuda")
+    m = ToyMultiLinearModel(1024, 1024, 1024).eval().to(dtype).to("cuda")
     print(f"original model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

     example_inputs = m.example_inputs(dtype=dtype, device="cuda")
@@ -46,7 +33,7 @@ Here is the serialization and deserialization flow::
         state_dict = torch.load(f)

     with torch.device("meta"):
-        m_loaded = ToyLinearModel(1024, 1024, 1024).eval().to(dtype)
+        m_loaded = ToyMultiLinearModel(1024, 1024, 1024).eval().to(dtype)

     # `linear.weight` is nn.Parameter, so we check the type of `linear.weight.data`
     print(f"type of weight before loading: {type(m_loaded.linear1.weight.data), type(m_loaded.linear2.weight.data)}")
@@ -62,7 +49,7 @@ What happens when serializing an optimized model?
 To serialize an optimized model, we just need to call ``torch.save(m.state_dict(), f)``, because in torchao, we use tensor subclass to represent different dtypes or support different optimization techniques like quantization and sparsity. So after optimization, the only thing change is the weight Tensor is changed to an optimized weight Tensor, and the model structure is not changed at all.

 For example: original floating point model ``state_dict``::
-    
+
     {"linear1.weight": float_weight1, "linear2.weight": float_weight2}

 quantized model ``state_dict``::
@@ -75,14 +62,14 @@ The size of the quantized model is typically going to be smaller to the original
     original model size: 4.0 MB
     quantized model size: 1.0625 MB

-    
+
 What happens when deserializing an optimized model?
 ===================================================

 To deserialize an optimized model, we can initialize the floating point model in `meta `__ device and then load the optimized ``state_dict`` with ``assign=True`` using `model.load_state_dict `__::

     with torch.device("meta"):
-        m_loaded = ToyLinearModel(1024, 1024, 1024).eval().to(dtype)
+        m_loaded = ToyMultiLinearModel(1024, 1024, 1024).eval().to(dtype)

     print(f"type of weight before loading: {type(m_loaded.linear1.weight), type(m_loaded.linear2.weight)}")
     m_loaded.load_state_dict(state_dict, assign=True)
@@ -97,5 +84,3 @@ We can also verify that the weight is properly loaded by checking the type of we

     type of weight before loading: (, )
     type of weight after loading: (, )
-
-
diff --git a/scripts/quick_start.py b/scripts/quick_start.py
index 55c17a8684..87b3b02690 100644
--- a/scripts/quick_start.py
+++ b/scripts/quick_start.py
@@ -8,6 +8,7 @@
 import torch

 from torchao.quantization import Int4WeightOnlyConfig, quantize_
+from torchao.testing.model_architectures import ToyMultiLinearModel
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
     benchmark_model,
@@ -18,20 +19,7 @@
 # | Set up model |
 # ================

-
-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, m: int, n: int, k: int):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(m, n, bias=False)
-        self.linear2 = torch.nn.Linear(n, k, bias=False)
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        return x
-
-
-model = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")
+model = ToyMultiLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")

 # Optional: compile model for faster inference and generation
 model = torch.compile(model, mode="max-autotune", fullgraph=True)
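For orientation, here is a minimal sketch of the quick-start flow once the shared helper is in place. It is only a sketch: it assumes a CUDA device, reuses names that appear in this diff (``ToyMultiLinearModel``, ``quantize_``, ``Int4WeightOnlyConfig``), and the ``group_size=32`` setting is an illustrative choice, not something this patch prescribes::

    import torch
    from torchao.quantization import Int4WeightOnlyConfig, quantize_
    from torchao.testing.model_architectures import ToyMultiLinearModel

    # Shared toy model instead of a per-file ToyLinearModel copy.
    model = ToyMultiLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")

    # example_inputs() returns a list of (1, sequence_length, m) tensors;
    # with the default batch_size=1 it can be unpacked directly into forward.
    example_inputs = model.example_inputs(dtype=torch.bfloat16, device="cuda")

    quantize_(model, Int4WeightOnlyConfig(group_size=32))
    out = model(*example_inputs)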
diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py
index 1dfed4dda8..49f7dc5639 100644
--- a/test/dtypes/test_affine_quantized_float.py
+++ b/test/dtypes/test_affine_quantized_float.py
@@ -45,6 +45,7 @@
     _quantize_affine_float8,
     choose_qparams_affine,
 )
+from torchao.testing.model_architectures import ToyMultiLinearModel
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_90,
@@ -55,18 +56,6 @@
 torch.manual_seed(0)


-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, in_features, out_features):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
-        self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        return x
-
-
 class TestAffineQuantizedFloat8Compile(InductorTestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @unittest.skipIf(
@@ -129,7 +118,7 @@ def test_fp8_linear_variants(
         }

         # Create a linear layer with bfloat16 dtype
-        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+        model = ToyMultiLinearModel(K, N).eval().to(dtype).to("cuda")

         quantized_model = copy.deepcopy(model)
         factory = mode_map[mode]()
@@ -186,7 +175,7 @@ def test_per_row_with_float32(self):
             AssertionError,
             match="PerRow quantization only works for bfloat16 precision",
         ):
-            model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
+            model = ToyMultiLinearModel(64, 64).eval().to(torch.float32).to("cuda")
             quantize_(
                 model,
                 Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
@@ -199,7 +188,7 @@ def test_per_row_with_float32(self):
     @common_utils.parametrize("mode", ["dynamic", "weight-only", "static"])
     def test_serialization(self, mode: str):
         # Create and quantize the model
-        model = ToyLinearModel(16, 32).to(device="cuda")
+        model = ToyMultiLinearModel(16, 32).to(device="cuda")

         mode_map = {
             "dynamic": partial(
@@ -231,7 +220,7 @@ def test_serialization(self, mode: str):

         # Create a new model and load the state dict
         with torch.device("meta"):
-            new_model = ToyLinearModel(16, 32)
+            new_model = ToyMultiLinearModel(16, 32)
         if mode == "static":
             quantize_(new_model, factory)
         new_model.load_state_dict(loaded_state_dict, assign=True)
@@ -273,7 +262,7 @@ def test_serialization(self, mode: str):
     )
     def test_fp8_weight_dimension_warning(self):
         # Create model with incompatible dimensions (not multiples of 16)
-        model = ToyLinearModel(10, 25).cuda()  # 10x25 and 25x10 weights
+        model = ToyMultiLinearModel(10, 25).cuda()  # 10x25 and 25x10 weights

         # Set up logging capture
         with self.assertLogs(
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index 5514228f4b..af3d5ff2db 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -2134,28 +2134,12 @@ def test_get_model_size_aqt(self, api, test_device, test_dtype):


 class TestBenchmarkModel(unittest.TestCase):
-    class ToyLinearModel(torch.nn.Module):
-        def __init__(self, m=64, n=32, k=64):
-            super().__init__()
-            self.linear1 = torch.nn.Linear(m, n, bias=False)
-            self.linear2 = torch.nn.Linear(n, k, bias=False)
-
-        def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
-            return (
-                torch.randn(
-                    batch_size, self.linear1.in_features, dtype=dtype, device=device
-                ),
-            )
-
-        def forward(self, x):
-            x = self.linear1(x)
-            x = self.linear2(x)
-            return x
+    from torchao.testing.model_architectures import ToyMultiLinearModel

     def run_benchmark_model(self, device):
         # params
         dtype = torch.bfloat16
-        m = self.ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to(device)
+        m = self.ToyMultiLinearModel(1024, 1024, 1024).eval().to(dtype).to(device)
         m_bf16 = copy.deepcopy(m)
         example_inputs = m.example_inputs(dtype=dtype, device=device)
         m_bf16 = torch.compile(m_bf16, mode="max-autotune")
diff --git a/test/prototype/test_awq.py b/test/prototype/test_awq.py
index 5538fa513d..ded2bb8227 100644
--- a/test/prototype/test_awq.py
+++ b/test/prototype/test_awq.py
@@ -15,36 +15,13 @@

 from torchao.prototype.awq import AWQConfig, AWQStep
 from torchao.quantization import FbgemmConfig, Int4WeightOnlyConfig, quantize_
+from torchao.testing.model_architectures import ToyMultiLinearModel
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_6,
     _is_fbgemm_genai_gpu_available,
 )


-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, m=512, n=256, k=128):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(m, n, bias=False)
-        self.linear2 = torch.nn.Linear(n, k, bias=False)
-        self.linear3 = torch.nn.Linear(k, 64, bias=False)
-
-    def example_inputs(
-        self, batch_size, sequence_length=10, dtype=torch.bfloat16, device="cuda"
-    ):
-        return [
-            torch.randn(
-                1, sequence_length, self.linear1.in_features, dtype=dtype, device=device
-            )
-            for j in range(batch_size)
-        ]
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        x = self.linear3(x)
-        return x
-
-
 @unittest.skipIf(not torch.cuda.is_available(), reason="CUDA not available")
 @unittest.skipIf(
     not _is_fbgemm_genai_gpu_available(),
@@ -77,7 +54,7 @@ def test_awq_functionality(self):
         n_calibration_examples = 10
         sequence_length = 5

-        m = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
+        m = ToyMultiLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)

         # baseline quantization
         base_config = FbgemmConfig(
@@ -126,7 +103,7 @@ def test_awq_loading(self):
         n_calibration_examples = 10
         sequence_length = 5

-        m = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
+        m = ToyMultiLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
         dataset = m.example_inputs(
             dataset_size,
             sequence_length=sequence_length,
@@ -158,7 +135,7 @@ def test_awq_loading(self):
         f.seek(0)
         state_dict = torch.load(f)

-        loaded_model = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
+        loaded_model = ToyMultiLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
         loaded_model.load_state_dict(state_dict, assign=True)

         m = torch.compile(m, fullgraph=True)
@@ -186,7 +163,7 @@ def test_awq_loading_vllm(self):
         n_calibration_examples = 10
         sequence_length = 5

-        m = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
+        m = ToyMultiLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
         dataset = m.example_inputs(
             dataset_size,
             sequence_length=sequence_length,
@@ -218,7 +195,7 @@ def test_awq_loading_vllm(self):
         f.seek(0)
         state_dict = torch.load(f)

-        loaded_model = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
+        loaded_model = ToyMultiLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)

         quant_config = AWQConfig(base_config, step=AWQStep.PREPARE_FOR_LOADING)
         quantize_(loaded_model, quant_config)
diff --git a/test/prototype/test_smoothquant.py b/test/prototype/test_smoothquant.py
index 568b2d964f..492e10afa6 100644
--- a/test/prototype/test_smoothquant.py
+++ b/test/prototype/test_smoothquant.py
@@ -25,30 +25,7 @@
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
 )
-
-
-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, m=512, n=256, k=128):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(m, n, bias=False)
-        self.linear2 = torch.nn.Linear(n, k, bias=False)
-        self.linear3 = torch.nn.Linear(k, 1, bias=False)
-
-    def example_inputs(
-        self, batch_size, sequence_length=10, dtype=torch.bfloat16, device="cuda"
-    ):
-        return [
-            torch.randn(
-                1, sequence_length, self.linear1.in_features, dtype=dtype, device=device
-            )
-            for j in range(batch_size)
-        ]
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        x = self.linear3(x)
-        return x
+from torchao.testing.model_architectures import ToyMultiLinearModel


 @unittest.skipIf(torch.version.hip is not None, "Skipping tests in ROCm")
@@ -189,7 +166,7 @@ def test_save_load_recipe(self, alpha, quant_mode, device, input_dtype):
         sequence_length = 5

         # Create two identical models for comparison
-        m = ToyLinearModel(*layer_dims).eval().to(input_dtype).to(device)
+        m = ToyMultiLinearModel(*layer_dims).eval().to(input_dtype).to(device)
         m_save_load = deepcopy(m)

         # Generate calibration dataset
diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
index cc55299074..767e46d5bf 100644
--- a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+++ b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -24,6 +24,7 @@
 )
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.quantization.utils import compute_error
+from torchao.testing.model_architectures import ToyMultiLinearModel
 from torchao.testing.utils import TorchAOIntegrationTestCase
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
@@ -36,18 +37,6 @@
 torch._dynamo.config.cache_size_limit = 128


-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, in_features, out_features):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
-        self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        return x
-
-
 # TODO: move tests in test_affine_quantized_float.py here after we migrated all implementations
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -113,7 +102,7 @@ def test_fp8_linear_variants(
         input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")

         # Create a linear layer with bfloat16 dtype
-        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+        model = ToyMultiLinearModel(K, N).eval().to(dtype).to("cuda")

         quantized_model = copy.deepcopy(model)

@@ -222,7 +211,7 @@ def test_kernel_preference_numerical_equivalence(self, granularity, sizes):
         dtype = torch.bfloat16
         input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
         # Create a linear layer with bfloat16 dtype
-        model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
+        model = ToyMultiLinearModel(K, N).eval().to(dtype).to("cuda")

         # reference kernel preference and results
         # we are using KerenelPreference.TORCH as the reference
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index b9d99e7ac7..1df82b235c 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -64,6 +64,7 @@
     Int8WeightOnlyQuantizedLinearWeight,
 )
 from torchao.quantization.utils import compute_error
+from torchao.testing.model_architectures import ToyMultiLinearModel
 from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_3,
@@ -131,25 +132,6 @@ def quantize(self, model: torch.nn.Module) -> torch.nn.Module:
         return model


-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, m=64, n=32, k=64, bias=False):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(m, n, bias=bias).to(torch.float)
-        self.linear2 = torch.nn.Linear(n, k, bias=bias).to(torch.float)
-
-    def example_inputs(self, batch_size=1, dtype=torch.float, device="cpu"):
-        return (
-            torch.randn(
-                batch_size, self.linear1.in_features, dtype=dtype, device=device
-            ),
-        )
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        return x
-
-
 def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs):
     """
     The deprecated implementation for int8 dynamic quant API, used as a reference for
@@ -211,7 +193,7 @@ class TestQuantFlow(TestCase):

     )
     def test_dynamic_quant_gpu_singleline(self):
-        m = ToyLinearModel().eval()
+        m = ToyMultiLinearModel().eval()
         example_inputs = m.example_inputs()
         quantize_(m, int8_dynamic_activation_int8_weight())
         m(*example_inputs)
@@ -225,7 +207,7 @@ def test_dynamic_quant_gpu_singleline(self):
     @unittest.skip("skipping for now due to torch.compile error")
     def test_dynamic_quant_gpu_unified_api_unified_impl(self):
         quantizer = XNNPackDynamicQuantizer()
-        m = ToyLinearModel().eval()
+        m = ToyMultiLinearModel().eval()
         example_inputs = m.example_inputs()
         m = quantizer.prepare(m)
         m = quantizer.convert(m)
@@ -242,7 +224,7 @@ def test_dynamic_quant_gpu_unified_api_unified_impl(self):
     )
     def test_dynamic_quant_gpu_unified_api_eager_mode_impl(self):
         quantizer = TorchCompileDynamicQuantizer()
-        m = ToyLinearModel().eval()
+        m = ToyMultiLinearModel().eval()
         example_inputs = m.example_inputs()
         m = quantizer.quantize(m)
         quantized = m(*example_inputs)
@@ -253,7 +235,7 @@ def test_dynamic_quant_gpu_unified_api_eager_mode_impl(self):
     @unittest.skipIf(not torch.xpu.is_available(), "Need XPU available")
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "only works for torch 2.8+")
     def test_int4_wo_quant_save_load(self):
-        m = ToyLinearModel().eval().cpu()
+        m = ToyMultiLinearModel().eval().cpu()

         def api(model):
             quantize_(model, int4_weight_only(layout=Int4XPULayout()))
@@ -268,7 +250,7 @@ def api(model):
         f.seek(0)
         state_dict = torch.load(f)

-        m2 = ToyLinearModel().eval().cpu()
+        m2 = ToyMultiLinearModel().eval().cpu()
         api(m2)

         m2.load_state_dict(state_dict)
@@ -281,7 +263,7 @@ def api(model):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "only works for torch 2.4+")
     def test_int8_wo_quant_save_load(self):
-        m = ToyLinearModel().eval().cpu()
+        m = ToyMultiLinearModel().eval().cpu()

         def api(model):
             quantize_(model, int8_weight_only())
@@ -296,7 +278,7 @@ def api(model):
         f.seek(0)
         state_dict = torch.load(f)

-        m2 = ToyLinearModel().eval().cpu()
+        m2 = ToyMultiLinearModel().eval().cpu()
         api(m2)

         m2.load_state_dict(state_dict)
@@ -316,7 +298,7 @@ def test_8da4w_quantizer(self):
         from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer

         quantizer = Int8DynActInt4WeightQuantizer(groupsize=32)
-        m = ToyLinearModel().eval()
+        m = ToyMultiLinearModel().eval()
         example_inputs = m.example_inputs()
         m = quantizer.quantize(m)
         assert isinstance(m.linear1, Int8DynActInt4WeightLinear)
@@ -331,7 +313,7 @@ def test_8da4w_quantizer_linear_bias(self):
         from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer

         quantizer = Int8DynActInt4WeightQuantizer(groupsize=32)
-        m = ToyLinearModel(bias=True).eval()
+        m = ToyMultiLinearModel(has_bias=True).eval()
         example_inputs = m.example_inputs()
         m = quantizer.quantize(m)
         assert isinstance(m.linear1, Int8DynActInt4WeightLinear)
@@ -450,7 +432,7 @@ def test_eval_wrapper_llama3(self):
     )
     def test_quantized_tensor_subclass_8da4w(self, mapping_type):
         group_size = 32
-        m = ToyLinearModel().eval()
+        m = ToyMultiLinearModel().eval()
         m_copy = copy.deepcopy(m)
         example_inputs = m.example_inputs()
         quantize_(
@@ -490,7 +472,7 @@ def test_quantized_tensor_subclass_8da4w(self, mapping_type):
     def test_quantized_tensor_subclass_int4(self):
         for device in self.GPU_DEVICES:
             # use 1024 so that we don't need padding
-            m = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to(device)
+            m = ToyMultiLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to(device)
             m_copy = copy.deepcopy(m)
             example_inputs = m.example_inputs(dtype=torch.bfloat16, device=device)

@@ -515,7 +497,7 @@ def test_quantized_tensor_subclass_int4(self):
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_quantized_tensor_subclass_int8_wo(self):
-        m = ToyLinearModel().eval().to(torch.bfloat16)
+        m = ToyMultiLinearModel().eval().to(torch.bfloat16)
         m_copy = copy.deepcopy(m)
         example_inputs = tuple(map(lambda x: x.to(torch.bfloat16), m.example_inputs()))

@@ -537,7 +519,7 @@ def test_quantized_tensor_subclass_int8_wo(self):
     @unittest.skipIf(TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.5 and below")
     def test_quantized_tensor_subclass_int8_dyn_quant(self):
         # use multiples of 1024 so that we don't need padding
-        m = ToyLinearModel(1024, 1024, 2048).eval().to(torch.bfloat16).to("cuda")
+        m = ToyMultiLinearModel(1024, 1024, 2048).eval().to(torch.bfloat16).to("cuda")
         m_copy = copy.deepcopy(m)
         # setting batch_size to 20 to be compatible with the kernel
         example_inputs = m.example_inputs(
@@ -578,7 +560,7 @@ def test_quantized_tensor_subclass_int8_dyn_quant(self):
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_quantized_tensor_subclass_save_load(self):
-        m = ToyLinearModel().eval().to(torch.bfloat16)
+        m = ToyMultiLinearModel().eval().to(torch.bfloat16)
         m_copy = copy.deepcopy(m)
         example_inputs = m.example_inputs(dtype=torch.bfloat16)

@@ -597,7 +579,7 @@ def test_quantized_tensor_subclass_save_load(self):
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_int8wo_quantized_model_to_device(self):
-        m = ToyLinearModel().eval().to(torch.bfloat16)
+        m = ToyMultiLinearModel().eval().to(torch.bfloat16)
         example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cpu")

         quantize_(m, int8_weight_only())
@@ -615,7 +597,7 @@ def test_int4wo_quantized_model_to_device(self):
         # TODO: change initial model to "cpu"
         devices = ["cuda", "cuda:0"]
         for device in devices:
-            m = ToyLinearModel().eval().to(torch.bfloat16).to(device)
+            m = ToyMultiLinearModel().eval().to(torch.bfloat16).to(device)
             example_inputs = m.example_inputs(dtype=torch.bfloat16, device=device)

             quantize_(m, int4_weight_only())
@@ -629,7 +611,7 @@ def test_int4wo_quantized_model_to_device(self):
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_quantized_tensor_subclass_save_load_map_location(self):
-        m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device="cuda")
+        m = ToyMultiLinearModel().eval().to(dtype=torch.bfloat16, device="cuda")
         example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cuda")

         quantize_(m, int8_weight_only())
@@ -640,7 +622,7 @@ def test_quantized_tensor_subclass_save_load_map_location(self):
         state_dict = torch.load(f.name, map_location="cpu", mmap=True)

         with torch.device("meta"):
-            m_copy = ToyLinearModel().eval()
+            m_copy = ToyMultiLinearModel().eval()

         m_copy.load_state_dict(state_dict, assign=True)
         m_copy.to(dtype=torch.bfloat16, device="cuda")
@@ -657,13 +639,13 @@ def reset_memory():
             torch.cuda.reset_peak_memory_stats()

         reset_memory()
-        m = ToyLinearModel()
+        m = ToyMultiLinearModel()
         quantize_(m.to(device="cuda"), int8_weight_only())
         memory_baseline = torch.cuda.max_memory_allocated()

         del m
         reset_memory()
-        m = ToyLinearModel()
+        m = ToyMultiLinearModel()
         quantize_(m, int8_weight_only(), device="cuda")
         memory_streaming = torch.cuda.max_memory_allocated()

@@ -677,7 +659,7 @@ def reset_memory():
     @common_utils.parametrize("use_hqq", [True, False])
     def test_int4wo_cpu(self, dtype, x_dim, use_hqq):
         device = "cpu"
-        m = ToyLinearModel().eval().to(dtype).to(device)
+        m = ToyMultiLinearModel().eval().to(dtype).to(device)
         example_inputs = m.example_inputs(dtype=dtype, device=device)
         if x_dim == 3:
             example_inputs = (example_inputs[0].unsqueeze(0),)
@@ -772,7 +754,7 @@ def test_module_fqn_to_config_default(self):
         config1 = Int4WeightOnlyConfig(group_size=32)
         config2 = Int8WeightOnlyConfig()
         config = ModuleFqnToConfig({"_default": config1, "linear2": config2})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
+        model = ToyMultiLinearModel().cuda().to(dtype=torch.bfloat16)
         example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
         quantize_(model, config)
         model(*example_inputs)
@@ -786,7 +768,7 @@ def test_module_fqn_to_config_module_name(self):
         config1 = Int4WeightOnlyConfig(group_size=32)
         config2 = Int8WeightOnlyConfig()
         config = ModuleFqnToConfig({"linear1": config1, "linear2": config2})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
+        model = ToyMultiLinearModel().cuda().to(dtype=torch.bfloat16)
         example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
         quantize_(model, config)
         model(*example_inputs)
@@ -831,7 +813,7 @@ def test_module_fqn_to_config_embedding_linear(self):
     def test_module_fqn_to_config_skip(self):
         config1 = Int4WeightOnlyConfig(group_size=32)
         config = ModuleFqnToConfig({"_default": config1, "linear2": None})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
+        model = ToyMultiLinearModel().cuda().to(dtype=torch.bfloat16)
         example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
         quantize_(model, config)
         model(*example_inputs)
@@ -842,7 +824,7 @@ def test_module_fqn_to_config_skip(self):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_int4wo_cuda_serialization(self):
         config = Int4WeightOnlyConfig(group_size=32)
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
+        model = ToyMultiLinearModel().cuda().to(dtype=torch.bfloat16)
         # quantize in cuda
         quantize_(model, config)
         example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
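The serialization tests above all exercise the same save / load-on-meta pattern; a rough sketch of that flow with the shared model follows. This is only a sketch: file handling is simplified, a CUDA device and bfloat16 weights are assumed, and ``group_size=32`` is chosen purely for illustration::

    import tempfile

    import torch
    from torchao.quantization import Int4WeightOnlyConfig, quantize_
    from torchao.testing.model_architectures import ToyMultiLinearModel

    m = ToyMultiLinearModel().eval().to(torch.bfloat16).to("cuda")
    quantize_(m, Int4WeightOnlyConfig(group_size=32))

    # Save the quantized state_dict, then reload it.
    with tempfile.NamedTemporaryFile() as f:
        torch.save(m.state_dict(), f)
        f.seek(0)
        state_dict = torch.load(f)

    # Rebuild the float model on the meta device and adopt the quantized
    # weights without copying, as the tests above do via assign=True.
    with torch.device("meta"):
        m_loaded = ToyMultiLinearModel().eval().to(torch.bfloat16)
    m_loaded.load_state_dict(state_dict, assign=True)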
diff --git a/test/sparsity/test_fast_sparse_training.py b/test/sparsity/test_fast_sparse_training.py
index 804a585dd8..5cfd83d204 100644
--- a/test/sparsity/test_fast_sparse_training.py
+++ b/test/sparsity/test_fast_sparse_training.py
@@ -15,22 +15,10 @@
     swap_linear_with_semi_sparse_linear,
     swap_semi_sparse_linear_with_linear,
 )
+from torchao.testing.model_architectures import ToyMultiLinearModel
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_4, is_fbcode


-class ToyModel(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.linear1 = nn.Linear(128, 256, bias=False)
-        self.linear2 = nn.Linear(256, 128, bias=False)
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = torch.nn.functional.relu(x)
-        x = self.linear2(x)
-        return x
-
-
 class TestRuntimeSemiStructuredSparsity(TestCase):
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "pytorch 2.4+ feature")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -42,7 +30,7 @@ def test_runtime_weight_sparsification(self):

         input = torch.rand((128, 128)).half().cuda()
         grad = torch.rand((128, 128)).half().cuda()
-        model = ToyModel().half().cuda()
+        model = ToyMultiLinearModel().half().cuda()
         model_c = copy.deepcopy(model)

         for name, mod in model.named_modules():
@@ -91,7 +79,7 @@ def test_runtime_weight_sparsification_compile(self):

         input = torch.rand((128, 128)).half().cuda()
         grad = torch.rand((128, 128)).half().cuda()
-        model = ToyModel().half().cuda()
+        model = ToyMultiLinearModel().half().cuda()
         model_c = copy.deepcopy(model)

         for name, mod in model.named_modules():
diff --git a/test/test_model_architecture.py b/test/test_model_architecture.py
index 973939a56a..1ed6513c8a 100644
--- a/test/test_model_architecture.py
+++ b/test/test_model_architecture.py
@@ -45,7 +45,7 @@ def test_transformer_block(self, device):
             self.skipTest("CUDA not available")

         model, input_data = create_model_and_input_data(
-            "transformer_block", 10, 64, 32, device=device
+            "transformer_block", 10, 64, 32
         )
         output = model(input_data)
         self.assertEqual(output.shape, (10, 16, 64))
diff --git a/torchao/quantization/README.md b/torchao/quantization/README.md
index 47ecb9aabe..bf6cf4f615 100644
--- a/torchao/quantization/README.md
+++ b/torchao/quantization/README.md
@@ -276,20 +276,7 @@
     quantize_,
     Int4WeightOnlyConfig,
 )
-
-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, m=64, n=32, k=64):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(m, n, bias=False)
-        self.linear2 = torch.nn.Linear(n, k, bias=False)
-
-    def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
-        return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        return x
+from torchao.testing.model_architectures import ToyMultiLinearModel

 dtype = torch.bfloat16
-m = ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to("cuda")
+m = ToyMultiLinearModel(1024, 1024, 1024).eval().to(dtype).to("cuda")
diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py
index 0d038605fa..4c81847e30 100644
--- a/torchao/testing/model_architectures.py
+++ b/torchao/testing/model_architectures.py
@@ -11,14 +11,53 @@
 import torch.nn.functional as F


-# TODO: Refactor torchao and tests to use these models
-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, k=64, n=32, dtype=torch.bfloat16):
+class ToySingleLinearModel(torch.nn.Module):
+    """Single linear for m * k * n problem size"""
+
+    def __init__(
+        self, m=64, n=32, k=64, has_bias=False, dtype=torch.float, device="cuda"
+    ):
+        super().__init__()
+        self.m = m
+        self.dtype = dtype
+        self.device = device
+        self.linear = torch.nn.Linear(k, n, bias=has_bias).to(
+            dtype=self.dtype, device=self.device
+        )
+
+    def example_inputs(self):
+        return (
+            torch.randn(
+                self.m, self.linear.in_features, dtype=self.dtype, device=self.device
+            ),
+        )
+
+    def forward(self, x):
+        x = self.linear(x)
+        return x
+
+
+class ToyMultiLinearModel(torch.nn.Module):
+    def __init__(self, m=512, n=256, k=128, has_bias=False):
         super().__init__()
-        self.linear1 = torch.nn.Linear(k, n, bias=False).to(dtype)
+        self.linear1 = torch.nn.Linear(m, n, bias=has_bias)
+        self.linear2 = torch.nn.Linear(n, k, bias=has_bias)
+        self.linear3 = torch.nn.Linear(k, 64, bias=has_bias)
+
+    def example_inputs(
+        self, batch_size=1, sequence_length=10, dtype=torch.bfloat16, device="cuda"
+    ):
+        return [
+            torch.randn(
+                1, sequence_length, self.linear1.in_features, dtype=dtype, device=device
+            )
+            for _ in range(batch_size)
+        ]

     def forward(self, x):
         x = self.linear1(x)
+        x = self.linear2(x)
+        x = self.linear3(x)
         return x
diff --git a/tutorials/calibration_flow/awq_like.py b/tutorials/calibration_flow/awq_like.py
index 2e36626fed..9d3ffeda1e 100644
--- a/tutorials/calibration_flow/awq_like.py
+++ b/tutorials/calibration_flow/awq_like.py
@@ -44,6 +44,7 @@
     register_quantize_module_handler,
 )
 from torchao.quantization.utils import compute_error
+from torchao.testing.model_architectures import ToyMultiLinearModel


 class ObservedLinear(torch.nn.Linear):
@@ -165,31 +166,12 @@ def weight_quant_func(weight):


 ######## Test ##########

-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, m=64, n=32, k=64):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(m, k, bias=False)
-        self.linear2 = torch.nn.Linear(k, n, bias=False)
-
-    def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
-        return (
-            torch.randn(
-                batch_size, self.linear1.in_features, dtype=dtype, device=device
-            ),
-        )
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        return x
-
-
 def test_awq(target_dtype: torch.dtype, mapping_type: MappingType):
     print(f"Testing {target_dtype} static quantization:")
     torch.manual_seed(0)
     dtype = torch.bfloat16
-    m = ToyLinearModel().eval().to(dtype).to("cuda")
+    m = ToyMultiLinearModel().eval().to(dtype).to("cuda")
     m_bf16 = copy.deepcopy(m)

     example_inputs = m.example_inputs(dtype=dtype, device="cuda")
diff --git a/tutorials/calibration_flow/static_quant.py b/tutorials/calibration_flow/static_quant.py
index d81b00de8d..b9d9a6a61b 100644
--- a/tutorials/calibration_flow/static_quant.py
+++ b/tutorials/calibration_flow/static_quant.py
@@ -37,6 +37,7 @@
     register_quantize_module_handler,
 )
 from torchao.quantization.utils import compute_error
+from torchao.testing.model_architectures import ToyMultiLinearModel
 from torchao.utils import is_sm_at_least_90


@@ -241,31 +242,12 @@ def apply_static_quant(
         return QuantizedLinear.from_observed(module, config.target_dtype)


-class ToyLinearModel(torch.nn.Module):
-    def __init__(self, m=64, n=32, k=64):
-        super().__init__()
-        self.linear1 = torch.nn.Linear(m, k, bias=False)
-        self.linear2 = torch.nn.Linear(k, n, bias=False)
-
-    def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
-        return (
-            torch.randn(
-                batch_size, self.linear1.in_features, dtype=dtype, device=device
-            ),
-        )
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = self.linear2(x)
-        return x
-
-
 def test_static_quant(target_dtype: torch.dtype, mapping_type: MappingType):
     print(f"Testing {target_dtype} static quantization:")
     torch.manual_seed(0)
     dtype = torch.bfloat16
-    m = ToyLinearModel().eval().to(dtype).to("cuda")
+    m = ToyMultiLinearModel().eval().to(dtype).to("cuda")
     m_bf16 = copy.deepcopy(m)

     example_inputs = m.example_inputs(dtype=dtype, device="cuda")
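Taken together, the refactor leaves two shared toy models in ``torchao.testing.model_architectures``. A short usage sketch of both follows, assuming a CUDA device and using the default shapes added above; the batch and sequence sizes are arbitrary choices for illustration::

    import torch

    from torchao.testing.model_architectures import (
        ToyMultiLinearModel,
        ToySingleLinearModel,
    )

    # Single-linear benchmark model: stores m/n/k plus dtype/device and builds
    # its own example inputs.
    single = ToySingleLinearModel(m=64, n=32, k=64, dtype=torch.bfloat16, device="cuda")
    (x,) = single.example_inputs()   # one (m, k) activation
    y = single(x)                    # shape (64, 32)

    # Three-linear test model: example_inputs() returns a list of
    # (1, sequence_length, m) tensors, one per "batch" element.
    multi = ToyMultiLinearModel(512, 256, 128).to(torch.bfloat16).to("cuda")
    batch = multi.example_inputs(batch_size=4, sequence_length=5)
    outs = [multi(inp) for inp in batch]   # each output ends in the 64-wide linear3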