Serialize mx.

stamalakhov · stamalakhov · commit 0253cb936282 · 2026-01-19T17:15:33.000+03:00
TICO-DCO-1.0-Signed-off-by: s.malakhov <s.malakhov@partner.samsung.com>
diff --git a/tico/passes/decompose_fake_quantize.py b/tico/passes/decompose_fake_quantize.py
@@ -123,6 +123,27 @@ def call(self, exported_program: ExportedProgram) -> PassResult:
                     )
                     node.replace_all_uses_with(dequnt, propagate_meta=True)
                 modified = True
+                
+            if node.target in [torch.ops.circle_custom.quantize_mx.default]:
+                # tensor, elem_format, axis
+                assert len(node.args) == 3
+                _, elem_format, axis = node.args
+
+                with gm.graph.inserting_before(node):
+                    quant = create_node(
+                        g,
+                        torch.ops.circle_custom.quantize_float_to_mx.default,
+                        args=node.args,
+                        origin=node,
+                    )
+                    dequnt = create_node(
+                        g,
+                        torch.ops.circle_custom.dequantize_mx_to_float.default,
+                        args=(quant, *quant.args[1:]),
+                        kwargs=quant.kwargs,
+                    )
+                    node.replace_all_uses_with(dequnt, propagate_meta=True)
+                modified = True
 
         gm.graph.eliminate_dead_code()
         gm.graph.lint()
diff --git a/tico/quantization/passes/fold_quant_ops.py b/tico/quantization/passes/fold_quant_ops.py
@@ -144,7 +144,78 @@ def call(self, exported_program: ExportedProgram) -> PassResult:
                     )
                     dq.replace_all_uses_with(op, propagate_meta=False)
                     logger.debug(f"Removed redundant {dq.name}")
+            
+        for dq in graph.nodes:
+            if dq.op != "call_function":
+                continue
+            if (
+                dq.target
+                != torch.ops.circle_custom.dequantize_mx_to_float.default
+            ):
+                continue
+            
+            dq_args = dq.args
+
+            q = dq_args[0]
+            if q.target != torch.ops.circle_custom.quantize_float_to_mx.default:
+                continue
+            q_args = q.args
+            op = q_args[0]
+
+            # Check if Q and DQ have same parameters
+            if q_args[1] != dq_args[1]:
+                continue
+            if q_args[2] != dq_args[2]:
+                continue
+
+            # ───────────────────────────────────────────
+            # Case 1: op not yet quantized
+            # ───────────────────────────────────────────
+            if QPARAM_KEY not in op.meta:
+                #TODO
+                qparam = QuantParam()
+                qparam.dtype = "mxint8"# q_args[1] #TODO
+                qparam.quantized_dimension = q_args[2]
+                op.meta[QPARAM_KEY] = qparam
+
+                dq.replace_all_uses_with(op, propagate_meta=False)
+
+                logger.debug(f"{q.name} and {dq.name} are folded to {op.name}.")
+            # ───────────────────────────────────────────
+            # Case 2: op already quantized
+            #        2.1 same dtype  → nothing to do
+            #        2.2 diff dtype  → leave Q in place
+            # ───────────────────────────────────────────
+            else:
+                op_qparam: QuantParam = op.meta[QPARAM_KEY]
+                qdq_dtype = "mxint8"#q_args[1] #TODO
+
+                if op_qparam.dtype != qdq_dtype:
+                    # Attach QPARAM to Q once
+                    if QPARAM_KEY not in q.meta:
+                        qparam = QuantParam()
+                        qparam.dtype = qdq_dtype
+                        qparam.quantized_dimension = q_args[2]
+                        q.meta[QPARAM_KEY] = qparam
+                        assert len(q.users) == 1, "Fix me unless"
 
+                    dq.replace_all_uses_with(q, propagate_meta=False)
+                    logger.debug(f"{dq.name} is folded ({q.name} is left).")
+                else:
+                    # Same dtype → the Quantize–Dequantize pair is redundant.
+                    assert not op_qparam.scale
+                    assert not op_qparam.zero_point
+                    assert (
+                        op_qparam.dtype
+                        and op_qparam.dtype == 'mxint8' #TODO
+                    )
+                    assert (
+                        op_qparam.quantized_dimension is not None
+                        and op_qparam.quantized_dimension == q_args[2]
+                    )
+                    dq.replace_all_uses_with(op, propagate_meta=False)
+                    logger.debug(f"Removed redundant {dq.name}")
+                    
         graph.eliminate_dead_code()
         graph.lint()
         graph_module.recompile()
diff --git a/tico/quantization/wrapq/examples/quantize_linear.py b/tico/quantization/wrapq/examples/quantize_linear.py
@@ -36,6 +36,10 @@
 from tico.quantization.wrapq.mode import Mode
 from tico.quantization.wrapq.wrappers.nn.quant_linear import QuantLinear
 from tico.utils.utils import SuppressWarning
+from tico.quantization.wrapq.dtypes import DType
+from tico.quantization.wrapq.observers.minmax import MinMaxObserver
+from tico.quantization.wrapq.observers.mx import MXObserver
+from tico.quantization.wrapq.qscheme import QScheme
 
 
 # -------------------------------------------------------------------------
@@ -62,7 +66,14 @@ def forward(self, x):
 # -------------------------------------------------------------------------
 # 1. Replace the Linear with QuantLinear wrapper
 # -------------------------------------------------------------------------
-model.fc = prepare(fp32_layer, PTQConfig())  # type: ignore[assignment]
+cfg = PTQConfig(
+        default_dtype=DType.uint(8),
+        default_qscheme=QScheme.PER_TENSOR_ASYMM,
+        default_observer=MXObserver,#MinMaxObserver,
+        overrides = {"weight": {"dtype": DType.uint(4), "observer": MinMaxObserver}},
+)
+        
+model.fc = prepare(fp32_layer,cfg)  # type: ignore[assignment]
 qlayer = model.fc  # alias for brevity
 
 # -------------------------------------------------------------------------
diff --git a/tico/quantization/wrapq/examples/quantize_llama_mlp.py b/tico/quantization/wrapq/examples/quantize_llama_mlp.py
@@ -27,6 +27,11 @@
 from tico.quantization.wrapq.qscheme import QScheme
 from tico.quantization.wrapq.wrappers.llama.quant_mlp import QuantLlamaMLP
 from tico.utils.utils import SuppressWarning
+from tico.quantization.wrapq.dtypes import DType
+from tico.quantization.wrapq.mode import Mode
+from tico.quantization.wrapq.observers.minmax import MinMaxObserver
+from tico.quantization.wrapq.observers.mx import MXObserver
+from tico.quantization.wrapq.qscheme import QScheme
 
 name = "Maykeye/TinyLLama-v0"
 model = AutoModelForCausalLM.from_pretrained(name)
@@ -37,8 +42,19 @@
 # 1. Replace layer-0’s MLP with QuantLlamaMLP
 # -------------------------------------------------------------------------
 fp32_mlp = model.model.layers[0].mlp
+cfg = PTQConfig(
+    default_dtype=DType.int(16),
+    default_qscheme=QScheme.PER_TENSOR_SYMM,
+    default_observer=MXObserver, #MinMaxObserver,
+    overrides={
+        "gate_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+        "up_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+        "down_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+    }
+)
+    
 model.model.layers[0].mlp = prepare(
-    fp32_mlp, PTQConfig(default_dtype=INT16, default_qscheme=QScheme.PER_TENSOR_SYMM)
+    fp32_mlp, cfg#PTQConfig(default_dtype=INT16, default_qscheme=QScheme.PER_TENSOR_SYMM)
 )
 model.eval()
 
diff --git a/tico/quantization/wrapq/examples/quantize_llama_whole_decoder_layer.py b/tico/quantization/wrapq/examples/quantize_llama_whole_decoder_layer.py
@@ -38,13 +38,14 @@
 from tico.quantization.wrapq.dtypes import DType
 from tico.quantization.wrapq.mode import Mode
 from tico.quantization.wrapq.observers.minmax import MinMaxObserver
+from tico.quantization.wrapq.observers.mx import MXObserver
 from tico.quantization.wrapq.qscheme import QScheme
 from tico.quantization.wrapq.wrappers.llama.quant_decoder_layer import (
     QuantLlamaDecoderLayer,
 )
 from tico.utils.utils import SuppressWarning
 
-MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"  # "Maykeye/TinyLLama-v0"  # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct" #"Maykeye/TinyLLama-v0"   # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME, cache_dir="/mnt/storage/transformers_cache"
 )
@@ -64,20 +65,23 @@
 cfg = PTQConfig(
     default_dtype=DType.int(16),
     default_qscheme=QScheme.PER_TENSOR_SYMM,
-    default_observer=MinMaxObserver,
+    default_observer=MXObserver, #MinMaxObserver
     overrides={
         # local override: input observer now MinMax & 4-bit, per-channel asymmetric
         "mlp": {
-            "gate_proj": {"weight": {"dtype": DType.uint(4)}},
-            "up_proj": {"weight": {"dtype": DType.uint(4)}},
-            "down_proj": {"weight": {"dtype": DType.uint(4)}},
+            "gate_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "up_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "down_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
         },
         "self_attn": {
-            "q_proj": {"weight": {"dtype": DType.uint(4)}},
-            "k_proj": {"weight": {"dtype": DType.uint(4)}},
-            "v_proj": {"weight": {"dtype": DType.uint(4)}},
-            "o_proj": {"weight": {"dtype": DType.uint(4)}},
+            "q_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "k_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "v_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "o_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "scale": {"observer":MinMaxObserver},
         },
+        "input_layernorm" : {},
+        "post_attention_layernorm" : {},
     },
 )
 
diff --git a/tico/quantization/wrapq/examples/quantize_with_gptq.py b/tico/quantization/wrapq/examples/quantize_with_gptq.py
@@ -42,6 +42,7 @@
 from tico.quantization.wrapq.dtypes import DType
 from tico.quantization.wrapq.observers.affine_base import AffineObserverBase
 from tico.quantization.wrapq.observers.minmax import MinMaxObserver
+from tico.quantization.wrapq.observers.mx import MXObserver
 from tico.quantization.wrapq.qscheme import QScheme
 from tico.quantization.wrapq.utils.introspection import build_fqn_map
 from tico.quantization.wrapq.utils.metrics import perplexity
@@ -246,26 +247,26 @@ def main():
     print("Wrapping layers with PTQWrapper …")
     w_cfg = {
         "mlp": {
-            "gate_proj": {"weight": {"dtype": DType.uint(4)}},
-            "up_proj": {"weight": {"dtype": DType.uint(4)}},
-            "down_proj": {"weight": {"dtype": DType.uint(4)}},
+            "gate_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "up_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "down_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
         },
         "self_attn": {
-            "q_proj": {"weight": {"dtype": DType.uint(4)}},
-            "k_proj": {"weight": {"dtype": DType.uint(4)}},
-            "v_proj": {"weight": {"dtype": DType.uint(4)}},
-            "o_proj": {"weight": {"dtype": DType.uint(4)}},
+            "q_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "k_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "v_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
+            "o_proj": {"weight": {"dtype": DType.uint(4), "observer":MinMaxObserver}},
         },
     }
     cfg = PTQConfig(
-        default_dtype=DType.int(16),
+        default_dtype=DType.int(8),
         default_qscheme=QScheme.PER_TENSOR_SYMM,
-        default_observer=MinMaxObserver,
+        default_observer=MXObserver,#MinMaxObserver,
         overrides={
             "model.embeddings": {
-                "weight": {"dtype": DType.uint(8)}
+                "weight": {"dtype": DType.uint(8), "observer":MinMaxObserver},
             },  # embeddings to 8-bits
-            "lm_head": {"weight": {"dtype": DType.uint(8)}},  # lm_head to 8-bits
+            "lm_head": {"weight": {"dtype": DType.uint(8), "observer":MinMaxObserver}},  # lm_head to 8-bits
         },
     )
     for i in range(len(q_m.model.layers)):
diff --git a/tico/serialize/circle_mapping.py b/tico/serialize/circle_mapping.py
@@ -63,6 +63,8 @@ def str_to_circle_dtype(
         "int64": circle.TensorType.TensorType.INT64,
         "bool": circle.TensorType.TensorType.BOOL,
         "uint4": circle.TensorType.TensorType.UINT4,
+        "mxint8": circle.TensorType.TensorType.MXINT8,
+        "mxfp4": circle.TensorType.TensorType.MXFP4,
         # TODO Add more dtypes
     }
 
diff --git a/tico/utils/register_custom_op.py b/tico/utils/register_custom_op.py
@@ -703,7 +703,80 @@ def _(
         round: str = "nearest",  # Fixed
     ) -> torch.Tensor:
         return input_
+    
+def CircleQuantizeFloatToMX():
+    #TODO
+    @custom_op("circle_custom::quantize_float_to_mx", mutates_args=())
+    def quantize_mx(
+        input_: torch.Tensor,
+        elem_format: str,
+        axis: int,
+        shared_exp_method: str = "max",
+        round: str = "nearest",
+    ) -> torch.Tensor:
+        if elem_format == "int8":
+            scale_bits = 8
+            block_size = 32
+        else:
+            raise RuntimeError(f"Unsupported elem_format in quantize_mx: {elem_format}")
+
+        result = _quantize_mx(
+            input_,
+            scale_bits=scale_bits,
+            elem_format=elem_format,
+            axes=[axis],
+            block_size=block_size,
+            shared_exp_method=shared_exp_method,
+            round=round,
+        )
+        return result
+
+    @register_fake("circle_custom::quantize_float_to_mx")
+    def _(
+        input_: torch.Tensor,
+        elem_format: str,
+        axis: int,
+        shared_exp_method: str = "max",  # Fixed
+        round: str = "nearest",  # Fixed
+    ) -> torch.Tensor:
+        return input_
 
+def CircleDeQuantizeMXToFloat():
+    #TODO
+    @custom_op("circle_custom::dequantize_mx_to_float", mutates_args=())
+    def quantize_mx(
+        input_: torch.Tensor,
+        elem_format: str,
+        axis: int,
+        shared_exp_method: str = "max",
+        round: str = "nearest",
+    ) -> torch.Tensor:
+        if elem_format == "int8":
+            scale_bits = 8
+            block_size = 32
+        else:
+            raise RuntimeError(f"Unsupported elem_format in quantize_mx: {elem_format}")
+
+        result = _quantize_mx(
+            input_,
+            scale_bits=scale_bits,
+            elem_format=elem_format,
+            axes=[axis],
+            block_size=block_size,
+            shared_exp_method=shared_exp_method,
+            round=round,
+        )
+        return result
+
+    @register_fake("circle_custom::dequantize_mx_to_float")
+    def _(
+        input_: torch.Tensor,
+        elem_format: str,
+        axis: int,
+        shared_exp_method: str = "max",  # Fixed
+        round: str = "nearest",  # Fixed
+    ) -> torch.Tensor:
+        return input_
 
 def CircleRMSNorm():
     @custom_op("circle_custom::rms_norm", mutates_args=())
@@ -800,6 +873,8 @@ def RegisterOps():
     CircleAvgPool2D()
     CircleInstanceNorm()
     CircleQuantizeMX()
+    CircleQuantizeFloatToMX()
+    CircleDeQuantizeMXToFloat()
     CircleRMSNorm()
     CircleAttention()
     CircleShape()
diff --git a/tico/utils/utils.py b/tico/utils/utils.py
@@ -268,6 +268,8 @@ def has_quantization_ops(graph: torch.fx.Graph):
         torch.ops.quantized_decomposed.quantize_per_channel.default,
         torch.ops.quantized_decomposed.dequantize_per_tensor.default,
         torch.ops.quantized_decomposed.dequantize_per_channel.default,
+        torch.ops.circle_custom.quantize_float_to_mx.default,
+        torch.ops.circle_custom.dequantize_mx_to_float.default,
     ]
     for node in graph.nodes:
         if node.op != "call_function":

Original file line number	Diff line number	Diff line change
`@@ -63,6 +63,8 @@ def str_to_circle_dtype(`
`63`	`63`	`"int64": circle.TensorType.TensorType.INT64,`
`64`	`64`	`"bool": circle.TensorType.TensorType.BOOL,`
`65`	`65`	`"uint4": circle.TensorType.TensorType.UINT4,`
	`66`	`+ "mxint8": circle.TensorType.TensorType.MXINT8,`
	`67`	`+ "mxfp4": circle.TensorType.TensorType.MXFP4,`
`66`	`68`	`# TODO Add more dtypes`
`67`	`69`	`}`
`68`	`70`
Original file line number	Diff line number	Diff line change
`@@ -268,6 +268,8 @@ def has_quantization_ops(graph: torch.fx.Graph):`
`268`	`268`	`torch.ops.quantized_decomposed.quantize_per_channel.default,`
`269`	`269`	`torch.ops.quantized_decomposed.dequantize_per_tensor.default,`
`270`	`270`	`torch.ops.quantized_decomposed.dequantize_per_channel.default,`
	`271`	`+ torch.ops.circle_custom.quantize_float_to_mx.default,`
	`272`	`+ torch.ops.circle_custom.dequantize_mx_to_float.default,`
`271`	`273`	`]`
`272`	`274`	`for node in graph.nodes:`
`273`	`275`	`if node.op != "call_function":`