refactor commonly used toy model #2729

Open

wants to merge 4 commits into base: main
Changes from 1 commit
27 changes: 1 addition & 26 deletions benchmarks/benchmark_aq.py
@@ -20,6 +20,7 @@
Int4WeightOnlyQuantizedLinearWeight,
Int8WeightOnlyQuantizedLinearWeight,
)
from torchao.testing.model_architectures import ToyLinearModel
from torchao.utils import (
TORCH_VERSION_AT_LEAST_2_4,
TORCH_VERSION_AT_LEAST_2_5,
@@ -62,32 +63,6 @@ def _int4wo_api(mod, **kwargs):
change_linear_weights_to_int4_woqtensors(mod, **kwargs)


class ToyLinearModel(torch.nn.Module):
"""Single linear for m * k * n problem size"""

def __init__(
self, m=64, n=32, k=64, has_bias=False, dtype=torch.float, device="cuda"
):
super().__init__()
self.m = m
self.dtype = dtype
self.device = device
self.linear = torch.nn.Linear(k, n, bias=has_bias).to(
dtype=self.dtype, device=self.device
)

def example_inputs(self):
return (
torch.randn(
self.m, self.linear.in_features, dtype=self.dtype, device=self.device
),
)

def forward(self, x):
x = self.linear(x)
return x


def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs):
"""
The deprecated implementation for int8 dynamic quant API, used as a reference for
12 changes: 1 addition & 11 deletions docs/source/quick_start.rst
@@ -29,17 +29,7 @@ First, let's set up our toy model:

import copy
import torch

class ToyLinearModel(torch.nn.Module):
def __init__(self, m: int, n: int, k: int):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x
from torchao.testing.model_architectures import ToyLinearModel

model = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")

23 changes: 4 additions & 19 deletions docs/source/serialization.rst
@@ -7,7 +7,7 @@ Serialization and deserialization flow
======================================

Here is the serialization and deserialization flow::

import copy
import tempfile
import torch
@@ -16,20 +16,7 @@ Here is the serialization and deserialization flow::
quantize_,
Int4WeightOnlyConfig,
)

class ToyLinearModel(torch.nn.Module):
def __init__(self, m=64, n=32, k=64):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)

def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x
from torchao.testing.model_architectures import ToyLinearModel

dtype = torch.bfloat16
m = ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to("cuda")
@@ -62,7 +49,7 @@ What happens when serializing an optimized model?
To serialize an optimized model, we just need to call ``torch.save(m.state_dict(), f)``: in torchao we use tensor subclasses to represent different dtypes and to support optimization techniques like quantization and sparsity, so after optimization the only change is that the weight Tensor is replaced by an optimized weight Tensor; the model structure itself does not change. For example:

original floating point model ``state_dict``::

{"linear1.weight": float_weight1, "linear2.weight": float_weight2}

quantized model ``state_dict``::
@@ -75,7 +62,7 @@ The size of the quantized model is typically going to be smaller than the original floating point model::
original model size: 4.0 MB
quantized model size: 1.0625 MB
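
This comparison can be reproduced by serializing each ``state_dict`` and checking the file size on disk. A minimal sketch (the names ``m_float`` for a copy of the original floating point model and ``m`` for the quantized model are illustrative, not part of the original example)::

    import os
    import tempfile

    def serialized_size_mb(model):
        # Serialize only the state_dict and measure the resulting file size.
        with tempfile.TemporaryDirectory() as tmp:
            path = os.path.join(tmp, "model.pt")
            torch.save(model.state_dict(), path)
            return os.path.getsize(path) / 1e6

    print(f"original model size: {serialized_size_mb(m_float):.4f} MB")
    print(f"quantized model size: {serialized_size_mb(m):.4f} MB")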


What happens when deserializing an optimized model?
===================================================
To deserialize an optimized model, we can initialize the floating point model on the `meta <https://pytorch.org/docs/stable/meta.html>`__ device and then load the optimized ``state_dict`` with ``assign=True`` using `model.load_state_dict <https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict>`__::
@@ -97,5 +84,3 @@ We can also verify that the weight is properly loaded by checking the type of the weight::

type of weight before loading: (<class 'torch.Tensor'>, <class 'torch.Tensor'>)
type of weight after loading: (<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>, <class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>)
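
Putting these pieces together, here is a minimal end-to-end sketch of the meta-device loading flow described above (assuming ``m`` is the quantized ``ToyLinearModel`` and ``dtype`` is defined as in the serialization flow; ``weights_only=False`` is used because the checkpoint contains tensor subclasses)::

    import tempfile

    with tempfile.NamedTemporaryFile() as f:
        torch.save(m.state_dict(), f)
        f.seek(0)

        # Build the model skeleton on the meta device; no real storage is allocated.
        with torch.device("meta"):
            m_loaded = ToyLinearModel(1024, 1024, 1024).eval().to(dtype)

        # assign=True hands the deserialized quantized tensors to the module directly
        # instead of copying them into the meta parameters.
        state_dict = torch.load(f, weights_only=False)
        m_loaded.load_state_dict(state_dict, assign=True)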


14 changes: 1 addition & 13 deletions scripts/quick_start.py
@@ -8,6 +8,7 @@
import torch

from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.testing.model_architectures import ToyLinearModel
from torchao.utils import (
TORCH_VERSION_AT_LEAST_2_5,
benchmark_model,
@@ -18,19 +19,6 @@
# | Set up model |
# ================


class ToyLinearModel(torch.nn.Module):
def __init__(self, m: int, n: int, k: int):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


model = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")

# Optional: compile model for faster inference and generation
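
# The collapsed lines below compile the model before running it; a minimal
# sketch of that optional step (mode="max-autotune" is an illustrative choice,
# not necessarily what the script uses):
model = torch.compile(model, mode="max-autotune")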
13 changes: 1 addition & 12 deletions test/dtypes/test_affine_quantized_float.py
@@ -45,6 +45,7 @@
_quantize_affine_float8,
choose_qparams_affine,
)
from torchao.testing.model_architectures import ToyLinearModel
from torchao.utils import (
is_sm_at_least_89,
is_sm_at_least_90,
@@ -55,18 +56,6 @@
torch.manual_seed(0)


class ToyLinearModel(torch.nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


class TestAffineQuantizedFloat8Compile(InductorTestCase):
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@unittest.skipIf(
18 changes: 1 addition & 17 deletions test/integration/test_integration.py
@@ -2134,23 +2134,7 @@ def test_get_model_size_aqt(self, api, test_device, test_dtype):


class TestBenchmarkModel(unittest.TestCase):
class ToyLinearModel(torch.nn.Module):
def __init__(self, m=64, n=32, k=64):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)

def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
return (
torch.randn(
batch_size, self.linear1.in_features, dtype=dtype, device=device
),
)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x
from torchao.testing.model_architectures import ToyLinearModel

def run_benchmark_model(self, device):
# params
25 changes: 1 addition & 24 deletions test/prototype/test_awq.py
@@ -15,36 +15,13 @@

from torchao.prototype.awq import AWQConfig, AWQStep
from torchao.quantization import FbgemmConfig, Int4WeightOnlyConfig, quantize_
from torchao.testing.model_architectures import ToyLinearModel
from torchao.utils import (
TORCH_VERSION_AT_LEAST_2_6,
_is_fbgemm_genai_gpu_available,
)


class ToyLinearModel(torch.nn.Module):
def __init__(self, m=512, n=256, k=128):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)
self.linear3 = torch.nn.Linear(k, 64, bias=False)

def example_inputs(
self, batch_size, sequence_length=10, dtype=torch.bfloat16, device="cuda"
):
return [
torch.randn(
1, sequence_length, self.linear1.in_features, dtype=dtype, device=device
)
for j in range(batch_size)
]

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
x = self.linear3(x)
return x


@unittest.skipIf(not torch.cuda.is_available(), reason="CUDA not available")
@unittest.skipIf(
not _is_fbgemm_genai_gpu_available(),
@@ -31,23 +31,12 @@
is_sm_at_least_89,
is_sm_at_least_90,
)
from torchao.testing.model_architectures import ToyLinearModel

# Needed since changing args to function causes recompiles
torch._dynamo.config.cache_size_limit = 128


class ToyLinearModel(torch.nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.linear1 = torch.nn.Linear(in_features, out_features, bias=False)
self.linear2 = torch.nn.Linear(out_features, in_features, bias=False)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


# TODO: move tests in test_affine_quantized_float.py here after we migrated all implementations
@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
20 changes: 1 addition & 19 deletions test/quantization/test_quant_api.py
@@ -64,6 +64,7 @@
Int8WeightOnlyQuantizedLinearWeight,
)
from torchao.quantization.utils import compute_error
from torchao.testing.model_architectures import ToyLinearModel
from torchao.testing.utils import skip_if_rocm
from torchao.utils import (
TORCH_VERSION_AT_LEAST_2_3,
@@ -131,25 +132,6 @@ def quantize(self, model: torch.nn.Module) -> torch.nn.Module:
return model


class ToyLinearModel(torch.nn.Module):
def __init__(self, m=64, n=32, k=64, bias=False):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=bias).to(torch.float)
self.linear2 = torch.nn.Linear(n, k, bias=bias).to(torch.float)

def example_inputs(self, batch_size=1, dtype=torch.float, device="cpu"):
return (
torch.randn(
batch_size, self.linear1.in_features, dtype=dtype, device=device
),
)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs):
"""
The deprecated implementation for int8 dynamic quant API, used as a reference for
18 changes: 3 additions & 15 deletions test/sparsity/test_fast_sparse_training.py
@@ -15,22 +15,10 @@
swap_linear_with_semi_sparse_linear,
swap_semi_sparse_linear_with_linear,
)
from torchao.testing.model_architectures import ToyLinearModel
from torchao.utils import TORCH_VERSION_AT_LEAST_2_4, is_fbcode


class ToyModel(nn.Module):
def __init__(self):
super().__init__()
self.linear1 = nn.Linear(128, 256, bias=False)
self.linear2 = nn.Linear(256, 128, bias=False)

def forward(self, x):
x = self.linear1(x)
x = torch.nn.functional.relu(x)
x = self.linear2(x)
return x


class TestRuntimeSemiStructuredSparsity(TestCase):
@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "pytorch 2.4+ feature")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -42,7 +30,7 @@ def test_runtime_weight_sparsification(self):

input = torch.rand((128, 128)).half().cuda()
grad = torch.rand((128, 128)).half().cuda()
model = ToyModel().half().cuda()
model = ToyLinearModel().half().cuda()
model_c = copy.deepcopy(model)

for name, mod in model.named_modules():
@@ -91,7 +79,7 @@ def test_runtime_weight_sparsification_compile(self):

input = torch.rand((128, 128)).half().cuda()
grad = torch.rand((128, 128)).half().cuda()
model = ToyModel().half().cuda()
model = ToyLinearModel().half().cuda()
model_c = copy.deepcopy(model)

for name, mod in model.named_modules():
15 changes: 1 addition & 14 deletions torchao/quantization/README.md
@@ -276,20 +276,7 @@ from torchao.quantization.quant_api import (
quantize_,
Int4WeightOnlyConfig,
)

class ToyLinearModel(torch.nn.Module):
def __init__(self, m=64, n=32, k=64):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)

def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x
from torchao.testing.model_architectures import ToyLinearModel

dtype = torch.bfloat16
m = ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to("cuda")
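
The collapsed remainder of this example applies the config to the model; a minimal sketch of that step (group_size=32 and the input shape are illustrative choices, not taken from the README):

# Replace each linear weight with an int4 weight-only quantized tensor, in place.
quantize_(m, Int4WeightOnlyConfig(group_size=32))

# Run the quantized model on a sample input.
x = torch.randn(1, 1024, dtype=dtype, device="cuda")
out = m(x)
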
19 changes: 16 additions & 3 deletions torchao/testing/model_architectures.py
@@ -11,14 +11,27 @@
import torch.nn.functional as F


# TODO: Refactor torchao and tests to use these models
class ToyLinearModel(torch.nn.Module):
def __init__(self, k=64, n=32, dtype=torch.bfloat16):
def __init__(self, m=512, n=256, k=128):
super().__init__()
self.linear1 = torch.nn.Linear(k, n, bias=False).to(dtype)
self.linear1 = torch.nn.Linear(m, n, bias=False)
self.linear2 = torch.nn.Linear(n, k, bias=False)
self.linear3 = torch.nn.Linear(k, 1, bias=False)
Contributor:

Please create a separate model for two linear layers. The single-linear-layer model is used in the benchmarking run on CI.

Contributor Author (@namgyu-youn, Aug 11, 2025):

@jainapurva I would prefer to define ToySingleLinearModel and ToyMultiLinearModel in a future update, as you mentioned, but how about reverting benchmark_aq.py?

Unit tests (e.g., test_quant_api.py, test_awq.py) use single and multiple layers in a mixed manner, so switching them all to multiple layers would itself be an update. If that makes sense, benchmark_aq.py would be the only case using a single layer. Let me know which option aligns better.

Contributor:

ToySingleLinearModel and ToyMultiLinearModel sound good. Please ensure all the tests run smoothly with them.
For benchmark_aq.py you can add the bias parameter as the last argument in __init__ and set it to False by default. In addition, ToySingleLinearModel is used by .github/workflows/run_microbenchmarks.yml, which relies on create_model_and_input_data; please ensure that method keeps running smoothly and is updated for the new toy models.

Contributor Author (@namgyu-youn):

Sorry for opening the PR without checking that. I will follow your suggestion; thanks for the guidance.

(A sketch of the proposed ToySingleLinearModel / ToyMultiLinearModel split appears after this file's diff.)


def example_inputs(
self, batch_size, sequence_length=10, dtype=torch.bfloat16, device="cuda"
):
return [
torch.randn(
1, sequence_length, self.linear1.in_features, dtype=dtype, device=device
)
for j in range(batch_size)
]

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
x = self.linear3(x)
return x


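
Based on the review discussion above, one possible shape for the proposed split is sketched below. This is a minimal sketch only: the names ToySingleLinearModel and ToyMultiLinearModel, the default sizes, and the bias argument come from the review comments, not from merged code.

import torch


class ToySingleLinearModel(torch.nn.Module):
    """Single linear layer for an m * k * n problem size (the shape used by benchmark_aq.py)."""

    def __init__(self, m=64, n=32, k=64, bias=False, dtype=torch.float, device="cuda"):
        super().__init__()
        self.m = m
        self.dtype = dtype
        self.device = device
        self.linear = torch.nn.Linear(k, n, bias=bias).to(dtype=dtype, device=device)

    def example_inputs(self):
        return (
            torch.randn(
                self.m, self.linear.in_features, dtype=self.dtype, device=self.device
            ),
        )

    def forward(self, x):
        return self.linear(x)


class ToyMultiLinearModel(torch.nn.Module):
    """Two stacked linear layers, matching the toy model used in the unit tests and docs."""

    def __init__(self, m=64, n=32, k=64, bias=False):
        super().__init__()
        self.linear1 = torch.nn.Linear(m, n, bias=bias)
        self.linear2 = torch.nn.Linear(n, k, bias=bias)

    def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
        return (
            torch.randn(
                batch_size, self.linear1.in_features, dtype=dtype, device=device
            ),
        )

    def forward(self, x):
        return self.linear2(self.linear1(x))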