Commit d3ee6c2
Bump version for float8 dynamic quant and weight only quant configs
Summary: This PR changes the default VERSION for Float8DynamicActivationFloat8WeightConfig and Float8WeightOnlyConfig from 1 to 2 and deprecates the VERSION 1 configs and VERSION 1 quantized models; more details in #2649. It also extends the current config serialization to work with multiple config versions.

Deprecation Note: loading a VERSION 1 checkpoint now emits warnings:

```
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev"
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="bfloat16",
    device_map="cuda",
)

/data/users/jerryzh/ao/torchao/core/config.py:249: UserWarning: Stored version is not the same as current default version of the config: stored_version=1, current_version=2, please check the deprecation warning
  warnings.warn(
/data/users/jerryzh/ao/torchao/dtypes/floatx/float8_layout.py:113: UserWarning: Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig is deprecated and will no longer be supported in a future release, please upgrade torchao and quantize again, or download a newer torchao checkpoint, see #2649 for more details
  warnings.warn(
```

Suggestion: upgrade torchao to 0.13 or later and generate the checkpoint again:

```
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
```

Or download the checkpoint again (please let us know if the checkpoint is not updated).

Test Plan: serialize a model with a VERSION 1 config, load it, and check that the warnings are properly printed:

```
python test/integration/test_loading_deprecated_checkpoint.py
```

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2650, branch: jerryzh168/stack/14
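As a concrete illustration of the suggested upgrade path, the sketch below loads a base model, quantizes it with the new default (VERSION 2) config, and saves a fresh checkpoint. The base model id and output directory are illustrative assumptions, not part of this PR; `safe_serialization=False` is used because tensor-subclass weights are generally saved in non-safetensors format.

```python
# Hedged sketch of the upgrade path: re-quantize with the VERSION 2 default.
# Model id and output dir are placeholders (assumptions, not from this PR).
import torch
from transformers import AutoModelForCausalLM
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, quantize_
from torchao.quantization.granularity import PerRow

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # assumed base model; substitute your own
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
# After this PR, VERSION defaults to 2, so no explicit VERSION is needed.
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
model.save_pretrained(
    "opt-125m-float8dq-row-v2",  # hypothetical output directory
    safe_serialization=False,    # tensor subclasses typically need non-safetensors
)
```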
1 parent 3b4bc98 commit d3ee6c2

8 files changed: +163 −80 lines changed
test/core/test_config.py

Lines changed: 8 additions & 6 deletions

```diff
@@ -7,6 +7,7 @@
 import json
 import os
 import tempfile
+import warnings
 from dataclasses import dataclass
 from unittest import mock
 
@@ -15,7 +16,6 @@
 
 from torchao.core.config import (
     AOBaseConfig,
-    VersionMismatchError,
     config_from_dict,
     config_to_dict,
 )
@@ -176,7 +176,7 @@ def test_disallowed_modules():
 
 
 def test_version_mismatch():
-    """Test that version mismatch raises an error during reconstruction."""
+    """Test that version mismatch prints a warning during reconstruction."""
     # Create a config
     dummy_config = DummyNonAllowedConfig()
     reconstructable = config_to_dict(dummy_config)
@@ -186,11 +186,13 @@ def test_version_mismatch():
 
     # Patch to allow the module but should still fail due to version mismatch
     with mock.patch("torchao.core.config.ALLOWED_AO_MODULES", {__name__}):
-        with pytest.raises(
-            VersionMismatchError,
-            match="Version mismatch for DummyNonAllowedConfig: stored version 1 != current version 2",
-        ):
+        with warnings.catch_warnings(record=True) as caught_warnings:
             config_from_dict(reconstructable)
+        assert any(
+            "Stored version is not the same as current default version of the config"
+            in str(w.message)
+            for w in caught_warnings
+        ), "Didn't get expected warning message for version mismatch"
 
 
 def test_default_version():
```
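The behavioral change this test captures is small: `config_from_dict` used to raise `VersionMismatchError` and now warns. A simplified sketch of the check, not the actual torchao source:

```python
# Simplified sketch (not the actual torchao implementation) of the version
# check the updated test exercises: a stored/current mismatch now emits a
# UserWarning instead of raising VersionMismatchError.
import warnings

def check_config_version(stored_version: int, current_version: int) -> None:
    if stored_version != current_version:
        warnings.warn(
            "Stored version is not the same as current default version of the "
            f"config: stored_version={stored_version}, "
            f"current_version={current_version}, "
            "please check the deprecation warning"
        )

check_config_version(1, 2)  # warns; previously this path raised
```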

test/dtypes/test_affine_quantized_float.py

Lines changed: 52 additions & 24 deletions

```diff
@@ -30,17 +30,14 @@
 from torchao.float8.float8_utils import compute_error
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
-    float8_dynamic_activation_float8_weight,
-    float8_weight_only,
+    Float8StaticActivationFloat8WeightConfig,
+    Float8WeightOnlyConfig,
     quantize_,
 )
 from torchao.quantization.granularity import (
     PerRow,
     PerTensor,
 )
-from torchao.quantization.quant_api import (
-    float8_static_activation_float8_weight,
-)
 from torchao.quantization.quant_primitives import (
     MappingType,
     _choose_scale_float8,
@@ -119,11 +116,13 @@ def test_fp8_linear_variants(
         )
         mode_map = {
             "dynamic": partial(
-                float8_dynamic_activation_float8_weight, granularity=granularity
+                Float8DynamicActivationFloat8WeightConfig,
+                granularity=granularity,
+                VERSION=1,
             ),
-            "weight-only": float8_weight_only,
+            "weight-only": partial(Float8WeightOnlyConfig, VERSION=1),
             "static": partial(
-                float8_static_activation_float8_weight,
+                Float8StaticActivationFloat8WeightConfig,
                 scale=scale,
                 granularity=granularity,
             ),
@@ -152,7 +151,7 @@
     )
     def test_invalid_granularity(self):
         with pytest.raises(ValueError, match="Invalid granularity specification"):
-            float8_dynamic_activation_float8_weight(granularity="invalid")
+            Float8DynamicActivationFloat8WeightConfig(granularity="invalid", VERSION=1)
 
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
@@ -162,7 +161,9 @@ def test_mismatched_granularity(self):
             ValueError,
             match="Different granularities for activation and weight are not supported",
         ):
-            float8_dynamic_activation_float8_weight(granularity=(PerTensor(), PerRow()))
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=(PerTensor(), PerRow()), VERSION=1
+            )
 
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
@@ -172,8 +173,9 @@ class UnsupportedGranularity:
             pass
 
         with pytest.raises(ValueError, match="Invalid granularity types"):
-            float8_dynamic_activation_float8_weight(
-                granularity=(UnsupportedGranularity(), UnsupportedGranularity())
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=(UnsupportedGranularity(), UnsupportedGranularity()),
+                VERSION=1,
             )
 
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -187,7 +189,10 @@ def test_per_row_with_float32(self):
         ):
             model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
             quantize_(
-                model, float8_dynamic_activation_float8_weight(granularity=PerRow())
+                model,
+                Float8DynamicActivationFloat8WeightConfig(
+                    granularity=PerRow(), VERSION=1
+                ),
             )
 
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -201,11 +206,13 @@ def test_serialization(self, mode: str):
 
         mode_map = {
             "dynamic": partial(
-                float8_dynamic_activation_float8_weight, granularity=PerTensor()
+                Float8DynamicActivationFloat8WeightConfig,
+                granularity=PerTensor(),
+                VERSION=1,
             ),
-            "weight-only": float8_weight_only,
+            "weight-only": partial(Float8WeightOnlyConfig, VERSION=1),
             "static": partial(
-                float8_static_activation_float8_weight,
+                Float8StaticActivationFloat8WeightConfig,
                 scale=torch.tensor(1.0, dtype=torch.float32, device="cuda"),
                 granularity=PerTensor(),
             ),
@@ -275,7 +282,10 @@ def test_fp8_weight_dimension_warning(self):
             "torchao.quantization.quant_api", level="INFO"
         ) as log_context:
             quantize_(
-                model, float8_dynamic_activation_float8_weight(granularity=PerTensor())
+                model,
+                Float8DynamicActivationFloat8WeightConfig(
+                    granularity=PerTensor(), VERSION=1
+                ),
             )
         print(model)
 
@@ -320,7 +330,8 @@ def test_mm_float8dq_per_row(
         )
         test_linear = copy.deepcopy(ref_linear)
         quantize_(
-            test_linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            test_linear,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow(), VERSION=1),
         )
 
         quant_weight = test_linear.weight
@@ -472,7 +483,10 @@ def test_float8_tensor_slicing_basic(self, granularity):
         # Create and quantize a model
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=granularity, VERSION=1
+            ),
         )
 
         weight_impl = model.weight.original_weight_tensor.tensor_impl
@@ -506,7 +520,10 @@ def test_float8_tensor_slicing_per_tensor(self):
         # Create and quantize with per-tensor granularity
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=PerTensor(), VERSION=1
+            ),
         )
 
         original_weight = model.weight
@@ -537,7 +554,8 @@ def test_float8_tensor_slicing_per_row(self):
         # Create and quantize with per-row granularity
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow(), VERSION=1),
         )
 
         original_weight = model.weight  # Shape: (32, 64)
@@ -575,7 +593,10 @@ def test_float8_tensor_slicing_edge_cases(self):
         # Create and quantize a model
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=PerTensor(), VERSION=1
+            ),
         )
 
         original_weight = model.weight
@@ -613,7 +634,9 @@ def test_float8_tensor_slicing_functional_correctness(self, granularity):
         quant_model = copy.deepcopy(ref_model)
         quantize_(
             quant_model,
-            Float8DynamicActivationFloat8WeightConfig(granularity=granularity),
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=granularity, VERSION=1
+            ),
         )
 
         # Create input with batch size that works well with slicing
@@ -743,7 +766,12 @@ def test_expected_kernels_on_gpu(self, granularity, torch_compile_mode):
         m = torch.nn.Sequential(
             torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16)
         )
-        quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=granularity))
+        quantize_(
+            m,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=granularity, VERSION=1
+            ),
+        )
         m = torch.compile(m, mode=torch_compile_mode)
         x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
```
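These tests pin `VERSION=1` so the deprecated path stays covered until it is removed. The summary also notes that config serialization now handles multiple versions; assuming the stored dict records the config's VERSION (inferred from the warning text, not verified against the source), a round-trip would look like:

```python
# Sketch of multi-version config serialization, inferred from the summary
# and warning text; the exact round-trip behavior is an assumption.
from torchao.core.config import config_from_dict, config_to_dict
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
from torchao.quantization.granularity import PerRow

cfg_v1 = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow(), VERSION=1)
stored = config_to_dict(cfg_v1)      # serialized dict carries the config version
restored = config_from_dict(stored)  # warns: stored_version=1, current_version=2
assert restored.VERSION == 1         # assumed: the stored version is preserved
```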
test/float8/test_base.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -473,10 +473,10 @@ def test_quantize(self):
         m = nn.Sequential(nn.Linear(32, 32)).cuda()
         m = convert_to_float8_training(m)
         assert isinstance(m[0], Float8Linear), "Module is not a Float8Linear"
-        from torchao.quantization.quant_api import float8_weight_only, quantize_
+        from torchao.quantization import Float8WeightOnlyConfig, quantize_
 
-        quantize_(m, float8_weight_only())
-        assert m[0].weight.tensor_impl.float8_data.dtype == torch.float8_e4m3fn, (
+        quantize_(m, Float8WeightOnlyConfig())
+        assert m[0].weight.qdata.dtype == torch.float8_e4m3fn, (
             "Post quantization dtype should be torch.float8_e4m3fn"
         )
         with torch.no_grad():
```
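The changed assertion reflects where the raw float8 payload lives under the VERSION 2 tensor subclass: `weight.qdata` instead of `weight.tensor_impl.float8_data`. A minimal check mirroring the updated test (assumes a CUDA device with float8 support):

```python
# Mirrors the updated assertion above; assumes CUDA with sm89+.
import torch
import torch.nn as nn
from torchao.quantization import Float8WeightOnlyConfig, quantize_

m = nn.Sequential(nn.Linear(32, 32)).cuda()
quantize_(m, Float8WeightOnlyConfig())  # VERSION 2 is now the default
# VERSION 2 tensors expose the raw float8 payload as `qdata`.
assert m[0].weight.qdata.dtype == torch.float8_e4m3fn
```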
test/integration/test_loading_deprecated_checkpoint.py

Lines changed: 65 additions & 0 deletions

```diff
@@ -0,0 +1,65 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+import unittest
+import warnings
+
+import torch
+from torch.testing._internal import common_utils
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from torchao.utils import is_sm_at_least_89
+
+_MODEL_NAMES = [
+    "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev",
+]
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not is_sm_at_least_89(), "Nedd sm89+")
+class TestLoadingDeprecatedCheckpoint(TestCase):
+    @common_utils.parametrize("model_name", _MODEL_NAMES)
+    def test_load_model_and_run(self, model_name):
+        """Test that we print correct warning message when loading a deprecated checkpoint"""
+        # Load and quantize model
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            quantized_model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype="bfloat16",
+                device_map="cuda",
+            )
+        assert any(
+            "Stored version is not the same as current default version of the config"
+            in str(w.message)
+            for w in caught_warnings
+        ), "Didn't get expected warning message for version mismatch"
+
+        assert any(
+            "Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig is deprecated"
+            in str(w.message)
+            for w in caught_warnings
+        ), "Didn't get expected warning message for deprecation"
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        prompt = ("Hello, my name is",)
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+        ).to("cuda")
+        generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
+        # make sure it runs
+        _ = tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+
+common_utils.instantiate_parametrized_tests(TestLoadingDeprecatedCheckpoint)
+
+if __name__ == "__main__":
+    run_tests()
```
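For users who cannot re-quantize immediately, one stopgap (an editorial suggestion, not something this PR recommends) is to silence the deprecation warning while still loading the VERSION 1 checkpoint:

```python
# Stopgap sketch: suppress the deprecation UserWarning when loading a
# VERSION 1 checkpoint. Editorial suggestion, not part of this PR.
import warnings
from transformers import AutoModelForCausalLM

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message=".*deprecated.*", category=UserWarning)
    quantized_model = AutoModelForCausalLM.from_pretrained(
        "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev",
        torch_dtype="bfloat16",
        device_map="cuda",
    )
```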

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 7 additions & 22 deletions

```diff
@@ -184,7 +184,6 @@ def test_fp8_linear_variants(
             config = Float8DynamicActivationFloat8WeightConfig(
                 granularity=granularity,
                 kernel_preference=kernel_preference,
-                VERSION=2,
             )
         else:
             assert mode == "weight-only", f"Unsupported mode: {mode}"
@@ -210,9 +209,7 @@
         "AssertionError: tensor(False, device='cuda:0') is not true : sqnr: -2.90625, will fix a bit later",
     )
     def test_slice(self, granularity):
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         dtype = torch.bfloat16
         device = "cuda"
         dummy = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)
@@ -273,9 +270,7 @@ def test_slice(self, granularity):
 
     @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
     def test_slice_preserves_aliasing(self, granularity):
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
         l.weight = torch.nn.Parameter(
             torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
@@ -296,9 +291,7 @@ def test_slice_and_copy_similar_to_vllm(self, granularity):
 
         dtype = torch.bfloat16
         device = "cuda"
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         l = torch.nn.Linear(1024, 1024, device="cuda", dtype=dtype)
         quantize_(l, config)
 
@@ -335,9 +328,7 @@ def test_slice_and_copy_similar_to_vllm(self, granularity):
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
     def test_bmm(self):
         # only support per row quantization
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=PerRow(), VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
 
         class M(torch.nn.Module):
             def __init__(self, weight):
@@ -369,9 +360,7 @@ def forward(self, x):
         ],
     )
     def test_to_device(self, granularity, sizes):
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         M, N, K = sizes
         dtype = torch.bfloat16
         for device in self.GPU_DEVICES:
@@ -401,9 +390,7 @@ def test_to_device(self, granularity, sizes):
         ],
     )
     def test_cat(self, granularity, sizes):
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         dtype = torch.bfloat16
         device = "cuda"
         M, N, K = sizes
@@ -461,9 +448,7 @@ def test_moe_weight_reshape_ops(self):
         dtype = torch.bfloat16
         device = "cuda"
 
-        bmm_config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        bmm_config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         moe_config = MoEQuantConfig(bmm_config)
 
         batch_size = 4
```