|
| 1 | +# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +# ============================================================================= |
| 16 | +# POST-TRAINING QUANTIZATION EXAMPLE — Llama Decoder Layer (Self-Attn + MLP) |
| 17 | +# ----------------------------------------------------------------------------- |
| 18 | +# This demo shows how to: |
| 19 | +# 1. Replace a single FP32 `LlamaDecoderLayer` with `QuantLlamaDecoderLayer`. |
| 20 | +# 2. Collect activation statistics in one calibration sweep. |
| 21 | +# 3. Freeze scales / zero-points and switch to INT-simulation mode. |
| 22 | +#   4. Compare quantized (A16/W4 simulated) vs FP32 outputs with a quick mean-absolute-diff check.
| 23 | +# 5. Export the calibrated, quantized block to a Circle model. |
| 24 | +# ----------------------------------------------------------------------------- |
| 25 | +# Style / layout is kept identical to the `quantize_llama_attn.py` and |
| 26 | +# `quantize_llama_mlp.py` examples for easy side-by-side reading. |
| 27 | +# ============================================================================= |
| 28 | + |
import os
import pathlib

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from tico.quantization import convert, prepare
from tico.quantization.config.ptq import PTQConfig
from tico.quantization.evaluation.metric import compute_peir
from tico.quantization.evaluation.utils import plot_two_outputs
from tico.quantization.wrapq.dtypes import DType
from tico.quantization.wrapq.mode import Mode
from tico.quantization.wrapq.observers.minmax import MinMaxObserver
from tico.quantization.wrapq.observers.mx import MXObserver
from tico.quantization.wrapq.qscheme import QScheme
from tico.quantization.wrapq.wrappers.llama.quant_decoder_layer import (
    QuantLlamaDecoderLayer,
)
from tico.utils.utils import SuppressWarning
| 47 | + |
# -------------------------------------------------------------------------
# 0. Load the FP32 reference model and tokenizer
# -------------------------------------------------------------------------
# Smaller alternatives for a quick smoke test:
#   "Maykeye/TinyLLama-v0", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"

# Allow the HF cache directory to be overridden via the environment so the
# example is not tied to one machine's filesystem layout; the original
# hard-coded path remains the fallback for backward compatibility.
CACHE_DIR = os.environ.get("TRANSFORMERS_CACHE_DIR", "/mnt/storage/transformers_cache")

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

model.config.max_position_embeddings = 2048  # we need this to prevent RAM exhaust
model.config.use_cache = False  # no KV-cache needed for single-layer calibration

model.eval()  # disable dropout, etc.
rotary = model.model.rotary_emb  # RoPE helper
| 60 | + |
# -------------------------------------------------------------------------
# 1. Swap in the quant wrapper
# -------------------------------------------------------------------------
fp32_layer = model.model.layers[0]  # keep a reference for diff check


def _weight_override(dtype):
    """Return a fresh per-module override: quantize the weight to *dtype*
    with a MinMax observer.  A new dict is built on every call so the
    config never shares (and cannot accidentally co-mutate) nested dicts."""
    return {"weight": {"dtype": dtype, "observer": MinMaxObserver}}


# Every linear projection uses the same W4 + MinMax recipe; build the eight
# identical overrides programmatically instead of repeating the literal.
cfg = PTQConfig(
    default_dtype=DType.int(16),
    default_qscheme=QScheme.PER_TENSOR_SYMM,
    # MXObserver is the default for everything not overridden below.
    default_observer=MXObserver,
    overrides={
        "mlp": {
            name: _weight_override(DType.uint(4))
            for name in ("gate_proj", "up_proj", "down_proj")
        },
        "self_attn": {
            **{
                name: _weight_override(DType.uint(4))
                for name in ("q_proj", "k_proj", "v_proj", "o_proj")
            },
            "scale": {"observer": MinMaxObserver},
        },
        "input_layernorm": _weight_override(DType.int(16)),
        "post_attention_layernorm": _weight_override(DType.int(16)),
    },
)

model.model.layers[0] = prepare(fp32_layer, cfg)
model.eval()

qlayer = model.model.layers[0]  # alias for brevity
assert isinstance(qlayer.wrapped, QuantLlamaDecoderLayer)
| 99 | + |
# -------------------------------------------------------------------------
# 2. Single-pass calibration (gather activation ranges)
# -------------------------------------------------------------------------
# Mixed scripts / lengths / punctuation so the observers see a spread of
# activation distributions in one sweep.
PROMPTS = [
    "The quick brown fox jumps over the lazy dog.",
    "In 2025, AI systems accelerated hardware-software co-design at scale.",
    "양자화는 왜 어려울까? 분포, 길이, 마스크가 관건이다.",
    "今日はいい天気ですね。ところでRoPE角度は長さに依存します。",
    "def quicksort(arr):\n if len(arr) <= 1: return arr\n ...",
    "Prices rose 3.14% — see Figure 2; emails: foo@bar.com!",
]

with torch.no_grad():
    for text in PROMPTS:
        token_ids = tokenizer(text, return_tensors="pt")["input_ids"]
        embeds = model.model.embed_tokens(token_ids)
        cos_sin = rotary(embeds, token_ids)  # (cos, sin) tuple
        seq_len = cos_sin[0].shape[1]
        # All-zero mask stands in for a causal mask during calibration.
        mask = torch.zeros(1, 1, seq_len, seq_len)
        qlayer(embeds, attention_mask=mask, position_embeddings=cos_sin)

# Freeze scales / zero-points and flip the wrapper into INT simulation.
convert(qlayer)

assert qlayer._mode is Mode.QUANT, "Quantization mode should be active now."
| 124 | + |
# -------------------------------------------------------------------------
# 3. Quick INT-sim vs FP32 sanity check
# -------------------------------------------------------------------------
probe = tokenizer("check", return_tensors="pt")
probe_hidden = model.model.embed_tokens(probe["input_ids"])
probe_pos = rotary(probe_hidden, probe["input_ids"])
probe_len = probe_pos[0].shape[1]
probe_mask = torch.zeros(1, 1, probe_len, probe_len)


def _hidden_states(layer_out):
    """Unwrap a HF-style layer output (tuple or bare tensor) to its tensor."""
    return layer_out[0] if isinstance(layer_out, tuple) else layer_out


with torch.no_grad():
    quant_out = _hidden_states(
        qlayer(probe_hidden, attention_mask=probe_mask, position_embeddings=probe_pos)
    )
    ref_out = _hidden_states(
        fp32_layer(probe_hidden, attention_mask=probe_mask, position_embeddings=probe_pos)
    )

print("┌───────────── Quantization Error Summary ─────────────")
print(f"│ Mean |diff|: {(quant_out - ref_out).abs().mean().item():.6f}")
print(f"│ PEIR       : {compute_peir(ref_out, quant_out) * 100:.6f} %")
print("└──────────────────────────────────────────────────────")
print(plot_two_outputs(ref_out, quant_out))
| 145 | + |
# -------------------------------------------------------------------------
# 4. Export the calibrated layer to Circle
# -------------------------------------------------------------------------
import tico

save_path = pathlib.Path(
    "decoder_layer.q.circle"
)  # "decoder_layer_unsloth_LLama_3_2_1B_RMS_NORM_A16W4.q.circle"

# Tiny example inputs are enough for tracing: batch 1, sequence of 4 tokens.
batch, seq, dim = 1, 4, model.config.hidden_size
sample_hidden = torch.randn(batch, seq, dim)
sample_pos = rotary(sample_hidden, torch.arange(seq)[None, :])
sample_mask = torch.zeros(1, 1, seq, seq)

with SuppressWarning(UserWarning, ".*"):
    circle_model = tico.convert(
        qlayer,
        (sample_hidden, sample_mask),
        {"position_embeddings": sample_pos},
        strict=False,
    )
# Note that the model is not fully quantized.
circle_model.save(save_path)

print(f"Quantized Circle model saved to {save_path.resolve()}")