|
30 | 30 | QuantizePerTensorArgs, |
31 | 31 | ) |
32 | 32 |
|
| 33 | +import copy |
| 34 | +from tico.utils.graph import create_node |
| 35 | +from tico.utils.utils import quant_min_max, set_new_meta_val |
| 36 | +from tico.quantization.passes.insert_quantize_on_dtype_mismatch import qparam_dtype |
| 37 | + |
| 38 | +def _insert_mx_quantize_op(node, qparam): |
| 39 | + graph = node.graph |
| 40 | + assert qparam.quantized_dimension is not None |
| 41 | + assert qparam.dtype is not None |
| 42 | + |
| 43 | + with graph.inserting_after(node): |
| 44 | + q_args = (node, qparam.dtype, qparam.quantized_dimension) |
| 45 | + quantize = create_node( |
| 46 | + graph, |
| 47 | + torch.ops.circle_custom.quantize_mx_decomposed.default, |
| 48 | + args=q_args, |
| 49 | + ) |
| 50 | + |
| 51 | + node.replace_all_uses_with(quantize, propagate_meta=True) |
| 52 | + quantize.replace_input_with(quantize, node) |
| 53 | + |
| 54 | + quantize.meta[QPARAM_KEY] = copy.deepcopy(qparam) |
| 55 | + |
| 56 | + return quantize |
| 57 | + |
| 58 | +def _insert_quantize_op(node, qparam): |
| 59 | + graph = node.graph |
| 60 | + min_, max_ = quant_min_max(qparam.dtype) |
| 61 | + dtype = getattr(torch, qparam.dtype) |
| 62 | + |
| 63 | + with graph.inserting_after(node): |
| 64 | + q_args = (node, qparam.scale[0], qparam.zero_point[0], min_, max_, dtype) |
| 65 | + quantize = create_node( |
| 66 | + graph, |
| 67 | + torch.ops.quantized_decomposed.quantize_per_tensor.default, |
| 68 | + args=q_args, |
| 69 | + ) |
| 70 | + |
| 71 | + node.replace_all_uses_with(quantize, propagate_meta=True) |
| 72 | + quantize.replace_input_with(quantize, node) |
| 73 | + |
| 74 | + quantize.meta[QPARAM_KEY] = copy.deepcopy(qparam) |
| 75 | + |
| 76 | + return quantize |
33 | 77 |
|
34 | 78 | @trace_graph_diff_on_pass |
35 | 79 | class FoldQuantOps(PassBase): |
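Both `_insert_mx_quantize_op` and `_insert_quantize_op` added above rely on the same torch.fx rewiring idiom: `replace_all_uses_with` redirects every consumer of the producer to the new Quantize node, including the Quantize node itself, so `replace_input_with` is then used to point the Quantize node's input back at the producer. A minimal, self-contained sketch of that idiom on a plain torch.fx graph (a toy example with a made-up `insert_after` helper, not part of this change):

import torch
import torch.fx as fx

def insert_after(node: fx.Node, target) -> fx.Node:
    """Insert `target(node)` right after `node` and rewire node's users to it."""
    graph = node.graph
    with graph.inserting_after(node):
        new_node = graph.call_function(target, args=(node,))
    # Every consumer of `node` now reads from `new_node` ...
    node.replace_all_uses_with(new_node)
    # ... including `new_node` itself, whose input must point back at `node`.
    new_node.replace_input_with(new_node, node)
    return new_node

gm = fx.symbolic_trace(lambda x: torch.relu(x))
relu = next(n for n in gm.graph.nodes if n.op == "call_function")
insert_after(relu, torch.neg)
gm.graph.lint()
gm.recompile()
print(gm.code)  # relu(x) is now followed by neg(...)

Running the sketch prints a graph in which `neg` has been spliced in between `relu` and the output, which is the same shape of rewrite the two helpers perform with the quantize ops.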
@@ -114,6 +158,13 @@ def call(self, exported_program: ExportedProgram) -> PassResult: |
114 | 158 | dq.replace_all_uses_with(op, propagate_meta=False) |
115 | 159 |
|
116 | 160 | logger.debug(f"{q.name} and {dq.name} are folded to {op.name}.") |
| 161 | + # The Dequantize node must not carry calibrated quantization parameters here; |
| 162 | + # folding would silently drop them. If this ever fires, a requantization step |
| 163 | + # is needed instead, e.g. for an "int16"/"uint8" producer feeding an "mxint8" |
| 164 | + # consumer via _insert_mx_quantize_op(op, dq.meta[QPARAM_KEY]). That path is |
| 165 | + # intentionally not implemented yet. |
| 166 | + assert QPARAM_KEY not in dq.meta |
| 167 | + |
117 | 168 | # ─────────────────────────────────────────── |
118 | 169 | # Case 2: op already quantized |
119 | 170 | # 2.1 same dtype → nothing to do |
@@ -144,7 +195,83 @@ def call(self, exported_program: ExportedProgram) -> PassResult: |
144 | 195 | ) |
145 | 196 | dq.replace_all_uses_with(op, propagate_meta=False) |
146 | 197 | logger.debug(f"Removed redundant {dq.name}") |
| 198 | + |
| 199 | + for dq in graph.nodes: |
| 200 | + if dq.op != "call_function": |
| 201 | + continue |
| 202 | + if ( |
| 203 | + dq.target |
| 204 | + != torch.ops.circle_custom.dequantize_mx_decomposed.default |
| 205 | + ): |
| 206 | + continue |
| 207 | + |
| 208 | + dq_args = dq.args |
| 209 | + |
| 210 | + q = dq_args[0] |
| 211 | + if q.target != torch.ops.circle_custom.quantize_mx_decomposed.default: |
| 212 | + continue |
| 213 | + q_args = q.args |
| 214 | + op = q_args[0] |
| 215 | + |
| 216 | + # Check if Q and DQ have same parameters |
| 217 | + if q_args[1] != dq_args[1]: |
| 218 | + continue |
| 219 | + if q_args[2] != dq_args[2]: |
| 220 | + continue |
| 221 | + |
| 222 | + # ─────────────────────────────────────────── |
| 223 | + # Case 1: op not yet quantized |
| 224 | + # ─────────────────────────────────────────── |
| 225 | + if QPARAM_KEY not in op.meta: |
| 226 | + qparam = QuantParam() |
| 227 | + # TODO Derive the dtype from q_args[1] instead of hard-coding it. |
| 228 | + qparam.dtype = "mxint8" |
| 229 | + qparam.quantized_dimension = q_args[2] |
| 230 | + op.meta[QPARAM_KEY] = qparam |
| 231 | + |
| 232 | + dq.replace_all_uses_with(op, propagate_meta=False) |
| 233 | + |
| 234 | + logger.debug(f"{q.name} and {dq.name} are folded to {op.name}.") |
| 235 | + if QPARAM_KEY in dq.meta: |
| 236 | + if qparam_dtype(op) == "mxint8" and qparam_dtype(dq) in ("int16", "uint8"): |
| 237 | + # The mx-quantized output feeds an integer-quantized consumer, so requantize. |
| 238 | + _insert_quantize_op(op, dq.meta[QPARAM_KEY]) |
147 | 239 |
|
| 240 | + # ─────────────────────────────────────────── |
| 241 | + # Case 2: op already quantized |
| 242 | + # 2.1 same dtype → nothing to do |
| 243 | + # 2.2 diff dtype → leave Q in place |
| 244 | + # ─────────────────────────────────────────── |
| 245 | + else: |
| 246 | + op_qparam: QuantParam = op.meta[QPARAM_KEY] |
| 247 | + qdq_dtype = "mxint8"#q_args[1] #TODO |
| 248 | + |
| 249 | + if op_qparam.dtype != qdq_dtype: |
| 250 | + # Attach QPARAM to Q once |
| 251 | + if QPARAM_KEY not in q.meta: |
| 252 | + qparam = QuantParam() |
| 253 | + qparam.dtype = qdq_dtype |
| 254 | + qparam.quantized_dimension = q_args[2] |
| 255 | + q.meta[QPARAM_KEY] = qparam |
| 256 | + assert len(q.users) == 1, "Fix me if Q has more than one user" |
| 257 | + |
| 258 | + dq.replace_all_uses_with(q, propagate_meta=False) |
| 259 | + logger.debug(f"{dq.name} is folded ({q.name} is left).") |
| 260 | + else: |
| 261 | + # Same dtype → the Quantize–Dequantize pair is redundant. |
| 262 | + assert not op_qparam.scale |
| 263 | + assert not op_qparam.zero_point |
| 264 | + # TODO Check against the dtype carried by q_args[1] instead of a literal. |
| 265 | + assert op_qparam.dtype == "mxint8" |
| 268 | + assert ( |
| 269 | + op_qparam.quantized_dimension is not None |
| 270 | + and op_qparam.quantized_dimension == q_args[2] |
| 271 | + ) |
| 272 | + dq.replace_all_uses_with(op, propagate_meta=False) |
| 273 | + logger.debug(f"Removed redundant {dq.name}") |
| 274 | + |
148 | 275 | graph.eliminate_dead_code() |
149 | 276 | graph.lint() |
150 | 277 | graph_module.recompile() |
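For reference, the fold condition enforced at the top of the new loop can be read as a single predicate. The sketch below is illustrative only (the `_is_foldable_mx_qdq` name is made up); it assumes tico's `circle_custom` ops are registered and mirrors the argument layout used above, where `args[1]` carries the element dtype and `args[2]` the quantized dimension:

import torch
from torch.fx import Node

def _is_foldable_mx_qdq(dq: Node) -> bool:
    """True if `dq` dequantizes the result of a quantize_mx node with matching parameters."""
    if dq.op != "call_function":
        return False
    if dq.target != torch.ops.circle_custom.dequantize_mx_decomposed.default:
        return False
    q = dq.args[0]
    if not isinstance(q, Node):
        return False
    if q.target != torch.ops.circle_custom.quantize_mx_decomposed.default:
        return False
    # Fold only when Q and DQ agree on element dtype (args[1]) and quantized dimension (args[2]).
    return q.args[1] == dq.args[1] and q.args[2] == dq.args[2]

Only nodes for which this predicate holds reach the Case 1 / Case 2 handling; everything else is skipped with `continue`.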
|