Skip to content

Commit d373df3

Browse files
committed
[quantization] [draft] GPTQ for VLM
This PR is the first try-out for full quantization of a VLM model by GPTQ+PTQ. TICO-DCO-1.0-Signed-off-by: s.malakhov <s.malakhov@partner.samsung.com>
1 parent fcacf65 commit d373df3

File tree

6 files changed

+733
-115
lines changed

6 files changed

+733
-115
lines changed

test/quantization/algorithm/test_gptq.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,36 @@ def forward(self, x):
150150
def get_example_inputs(self):
151151
return (torch.randn(1, 16, 7, 7),), {}
152152

153+
class NormConv3D(torch.nn.Module):
    """Two stacked 3-D convolutions used as a GPTQ quantization fixture."""

    def __init__(self):
        super().__init__()
        # Same submodule names ("m.0", "m.1") as building via repeated append.
        self.m = torch.nn.ModuleList(
            [
                torch.nn.Conv3d(16, 8, (2, 3, 5), stride=1),
                torch.nn.Conv3d(8, 32, (3, 5, 2), stride=2),
            ]
        )

    def forward(self, x):
        # Feed the input through both convolutions in order.
        out = x
        for conv in self.m:
            out = conv(out)
        return out

    def get_example_inputs(self):
        # Random activations laid out as (batch, channels, D, H, W).
        return (torch.randn(5, 16, 17, 19, 35),), {}

    def get_zero_inputs(self):
        # All-zero activations; exercises degenerate calibration statistics.
        return (torch.zeros(5, 16, 17, 19, 35),), {}
170+
171+
class PaddedNormConv3D(torch.nn.Module):
    """Single-Conv3d fixture exercising the string padding mode.

    NOTE(review): ``padding="valid"`` actually means *no* padding, so the
    class name is slightly misleading — what this fixture really covers is
    the string-valued padding code path.
    """

    def __init__(self):
        super().__init__()
        conv = torch.nn.Conv3d(16, 8, (2, 3, 5), stride=1, padding="valid")
        self.m = torch.nn.ModuleList([conv])

    def forward(self, x):
        # Single convolution; kept inside a ModuleList to mirror NormConv3D.
        return self.m[0](x)

    def get_example_inputs(self):
        # Random activations laid out as (batch, channels, D, H, W).
        return (torch.randn(5, 16, 17, 19, 35),), {}
153183

154184
class GPTQTest(unittest.TestCase):
155185
@unittest.skipIf(
@@ -430,3 +460,65 @@ def test_transposed_conv2d(self):
430460
), "second conv node is not quantized"
431461

432462
# TODO add quantization
463+
464+
@unittest.skipIf(
    not IS_INTERNAL_TEST, "Internal test — run only if --include-internal is set"
)
def test_normconv3d(self):
    """GPTQ should quantize both Conv3d layers of NormConv3D."""
    q_m = NormConv3D()
    q_m.eval()
    # Alias kept only to draw fresh calibration inputs after `q_m` is
    # rebound by prepare().
    ori_m = q_m

    # Apply GPTQ: prepare, run calibration batches, then convert in place.
    # (Fix: dropped the dead get_example_inputs() call that preceded the
    # loop — its result was immediately overwritten on the first iteration.)
    q_m = prepare(q_m, GPTQConfig(show_progress=False))
    for _ in range(30):
        args, kwargs = ori_m.get_example_inputs()
        q_m(*args, **kwargs)
    convert(q_m, inplace=True)

    # check that all convolution nodes are quantized
    assert hasattr(q_m, "quantizers"), "quantized model does not have quantizers"
    assert (
        "model.layers.0.m.0" in q_m.quantizers  # type: ignore[operator]
    ), "first conv node is not quantized"
    assert (
        "model.layers.0.m.1" in q_m.quantizers  # type: ignore[operator]
    ), "second conv node is not quantized"
487+
488+
@unittest.skipIf(
    not IS_INTERNAL_TEST, "Internal test — run only if --include-internal is set"
)
def test_normconv3d_on_zero_inputs(self):
    """Calibrating on all-zero inputs must not zero out the weights."""
    q_m = NormConv3D()
    q_m.eval()
    ori_m = q_m

    # Apply GPTQ with 30 all-zero calibration batches.
    q_m = prepare(q_m, GPTQConfig(show_progress=False))
    for _ in range(30):
        batch_args, batch_kwargs = ori_m.get_zero_inputs()
        q_m(*batch_args, **batch_kwargs)
    convert(q_m, inplace=True)
    assert torch.sum(q_m.m[0].weight != 0) > 0, "weights should not be all zeros"  # type: ignore[arg-type]
503+
504+
505+
@unittest.skipIf(
    not IS_INTERNAL_TEST, "Internal test — run only if --include-internal is set"
)
def test_paddednormconv3d(self):
    """GPTQ should quantize the string-padded Conv3d of PaddedNormConv3D."""
    q_m = PaddedNormConv3D()
    q_m.eval()
    # Alias kept only to draw fresh calibration inputs after `q_m` is
    # rebound by prepare().
    ori_m = q_m

    # Apply GPTQ: prepare, run calibration batches, then convert in place.
    # (Fix: dropped the dead get_example_inputs() call that preceded the
    # loop — its result was immediately overwritten on the first iteration.)
    q_m = prepare(q_m, GPTQConfig(show_progress=False))
    for _ in range(30):
        args, kwargs = ori_m.get_example_inputs()
        q_m(*args, **kwargs)
    convert(q_m, inplace=True)

    # check that all convolution nodes are quantized
    assert hasattr(q_m, "quantizers"), "quantized model does not have quantizers"
    assert (
        "model.layers.0.m.0" in q_m.quantizers  # type: ignore[operator]
    ), "first conv node is not quantized"

tico/quantization/algorithm/gptq/gptq.py

Lines changed: 153 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
import torch
2626
import torch.nn as nn
27+
import torch.nn.functional as F
2728

2829
from tico.quantization.algorithm.gptq.quant import quantize, Quantizer
2930
from tico.quantization.algorithm.gptq.utils import get_numerical_padding
@@ -167,7 +168,11 @@ def __init__(self, layer):
167168
self.layer = layer
168169
self.dev = self.layer.weight.device
169170
W = layer.weight.data.clone()
170-
if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
171+
if (
172+
isinstance(self.layer, nn.Conv2d)
173+
or isinstance(self.layer, nn.Conv1d)
174+
or isinstance(self.layer, nn.Conv3d)
175+
):
171176
W = W.flatten(1) # reshaped to matrix (OUT_channels x the_rest)
172177
elif isinstance(self.layer, nn.ConvTranspose2d):
173178
W = convtranspose2d_weights_to_conv2d_weights(self.layer, W)
@@ -251,10 +256,87 @@ def add_batch(self, inp, out):
251256
if isinstance(self.layer, nn.ConvTranspose2d):
252257
inp = get_matmul_input_for_convtranspose2d(self.layer, inp)
253258

259+
if isinstance(self.layer, nn.Conv3d):
260+
# adapted from https://discuss.pytorch.org/t/manual-implementation-of-unrolled-3d-convolutions/91021
261+
assert (
262+
self.layer.groups == 1
263+
) # depthwise/groupwise are not supported currently
264+
assert all(dilation == 1 for dilation in self.layer.dilation)
265+
266+
# test
267+
# input_dim = [22, 59, 114]
268+
# in_channels = 10
269+
# out_channels = 5
270+
# kernel_size = (4, 2, 3)
271+
# padding = (1, 4, 3)
272+
# stride = (1, 1, 1)
273+
# N = 51
274+
# input_tensor = torch.zeros(N, in_channels, input_dim[0], input_dim[1], input_dim[2]).uniform_(-1, 1)
275+
# conv = nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride, bias=False)
276+
# output_tensor = conv(input_tensor)
277+
# output_dim = [0, 0, 0]
278+
# output_dim[0] = int((input_tensor.shape[2] - kernel_size[0] + 2 * padding[0]) / stride[0]) + 1
279+
# output_dim[1] = int((input_tensor.shape[3] - kernel_size[1] + 2 * padding[1]) / stride[1]) + 1
280+
# output_dim[2] = int((input_tensor.shape[4] - kernel_size[2] + 2 * padding[2]) / stride[2]) + 1
281+
# if not all(item == 0 for item in padding):
282+
# input_tensor = F.pad(input_tensor, pad=(padding[2], padding[2], padding[1], padding[1], padding[0], padding[0]), mode="constant", value=0)
283+
#
284+
# unfolded_input_tensor = input_tensor.unfold(2, kernel_size[0], stride[0]).unfold(3, kernel_size[1], stride[1]).unfold(4, kernel_size[2], stride[2])
285+
# unfolded_input_tensor = unfolded_input_tensor.reshape(N, in_channels, -1, kernel_size[0] * kernel_size[1] * kernel_size[2])
286+
# unfolded_input_tensor = unfolded_input_tensor.permute([0, 2, 1, 3])
287+
# #unfolded_input_tensor = unfolded_input_tensor.reshape(-1, unfolded_input_tensor.shape[2] * unfolded_input_tensor.shape[3])
288+
# #unfolded_input_tensor = unfolded_input_tensor.reshape( unfolded_input_tensor.shape[0], unfolded_input_tensor.shape[1], unfolded_input_tensor.shape[2] * unfolded_input_tensor.shape[3])
289+
# #unfolded_input_tensor = unfolded_input_tensor.permute([2, 0, 1])
290+
# #unfolded_input_tensor = unfolded_input_tensor.flatten(1).T #(N * NPatches, inner_dim)
291+
# unfolded_input_tensor = unfolded_input_tensor.reshape(unfolded_input_tensor.shape[0] * unfolded_input_tensor.shape[1], unfolded_input_tensor.shape[2] * unfolded_input_tensor.shape[3])
292+
#
293+
# kernels_flat = conv.weight.detach().clone().flatten(1)#view(out_channels, -1)
294+
# alt_output_tensor = torch.matmul(kernels_flat, unfolded_input_tensor.T) #(out_channels, N * NPatches)
295+
# alt_output_tensor = alt_output_tensor.view(out_channels, N, output_dim[0], output_dim[1], output_dim[2])
296+
# alt_output_tensor = alt_output_tensor.permute([1, 0, 2, 3, 4])
297+
# eps_max = torch.max(torch.abs(output_tensor - alt_output_tensor))
298+
# eps_mean = torch.mean(torch.abs(output_tensor - alt_output_tensor))
299+
# assert( eps_max < 1.e-04 or eps_mean < 1.e-06)
300+
301+
# inp is assumed to be (N, C_in, H, W, D)
302+
padding = get_numerical_padding(self.layer)
303+
if isinstance(padding, int):
304+
padding = (padding, padding, padding)
305+
if not all(item == 0 for item in padding):
306+
inp = F.pad(
307+
inp,
308+
pad=(
309+
padding[2],
310+
padding[2],
311+
padding[1],
312+
padding[1],
313+
padding[0],
314+
padding[0],
315+
),
316+
mode="constant",
317+
value=0,
318+
)
319+
krn_size = self.layer.kernel_size
320+
stride = self.layer.stride
321+
inp = (
322+
inp.unfold(2, krn_size[0], stride[0])
323+
.unfold(3, krn_size[1], stride[1])
324+
.unfold(4, krn_size[2], stride[2])
325+
) # inp.shape = (N, C_in, ..patches... , krn_size[0], krn_size[1], krn_size[2])
326+
inp = inp.reshape(
327+
inp.shape[0], inp.shape[1], -1, krn_size[0] * krn_size[1] * krn_size[2]
328+
) # inp.shape = (N, C_in, num_patches, krn_size[0] * krn_size[1] * krn_size[2])
329+
inp = inp.permute(
330+
[0, 2, 1, 3]
331+
) # inp.shape = (N, num_patches, C_in, krn_size[0] * krn_size[1] * krn_size[2])
332+
inp = inp.reshape(
333+
inp.shape[0] * inp.shape[1], inp.shape[2] * inp.shape[3]
334+
).T # inp.shape =(C_in * krn_size[0] * krn_size[1] * krn_size[2], N * num_patches)
335+
254336
self.H *= self.nsamples / (self.nsamples + tmp)
255337
self.nsamples += tmp
256338
inp = math.sqrt(2 / self.nsamples) * inp.float()
257-
self.H += inp.matmul(inp.t())
339+
self.H += inp.matmul(inp.t()).to(self.H.device)
258340

259341
def fasterquant(
260342
self,
@@ -266,12 +348,23 @@ def fasterquant(
266348
verbose=False,
267349
):
268350
W = self.layer.weight.data.clone()
269-
if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
351+
if (
352+
isinstance(self.layer, nn.Conv2d)
353+
or isinstance(self.layer, nn.Conv1d)
354+
or isinstance(self.layer, nn.Conv3d)
355+
):
270356
W = W.flatten(1) # reshaped to matrix (OUT_channels x the_rest)
357+
if self.quantizer.sensitivity is not None:
358+
self.quantizer.sensitivity = self.quantizer.sensitivity.flatten(1)
271359
elif isinstance(self.layer, nn.ConvTranspose2d):
272360
W = convtranspose2d_weights_to_conv2d_weights(self.layer, W)
273361
conv2d_shape = W.shape
274362
W = W.flatten(1) # reshaped to matrix (OUT_channels x the_rest)
363+
if self.quantizer.sensitivity is not None:
364+
self.quantizer.sensitivity = convtranspose2d_weights_to_conv2d_weights(
365+
self.layer, self.quantizer.sensitivity
366+
)
367+
self.quantizer.sensitivity = self.quantizer.sensitivity.flatten(1)
275368

276369
W = W.float()
277370
tick = time.time()
@@ -313,49 +406,58 @@ def fasterquant(
313406
Hinv = H
314407

315408
assert isinstance(Hinv, torch.Tensor)
316-
for i1 in range(0, self.columns, blocksize):
317-
i2 = min(i1 + blocksize, self.columns)
318-
count = i2 - i1
319-
320-
W1 = W[:, i1:i2].clone()
321-
Q1 = torch.zeros_like(W1)
322-
Err1 = torch.zeros_like(W1)
323-
Losses1 = torch.zeros_like(W1)
324-
Hinv1 = Hinv[i1:i2, i1:i2]
325-
326-
for i in range(count):
327-
w = W1[:, i]
328-
d = Hinv1[i, i]
329-
330-
if groupsize != -1:
331-
if not static_groups:
332-
if (i1 + i) % groupsize == 0:
333-
self.quantizer.find_params(
334-
W[:, (i1 + i) : (i1 + i + groupsize)], weight=True
335-
)
336-
else:
337-
idx: torch.Tensor | int = i1 + i
338-
if actorder:
339-
idx = perm[idx]
340-
self.quantizer = groups[idx // groupsize]
341-
342-
q = quantize(
343-
w.unsqueeze(1),
344-
self.quantizer.scale,
345-
self.quantizer.zero,
346-
self.quantizer.maxq,
347-
).flatten()
348-
Q1[:, i] = q
349-
Losses1[:, i] = (w - q) ** 2 / d**2
350-
351-
err1 = (w - q) / d
352-
W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
353-
Err1[:, i] = err1
354-
355-
Q[:, i1:i2] = Q1
356-
Losses[:, i1:i2] = Losses1 / 2
357-
358-
W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
409+
just_quantize = False
410+
if just_quantize:
411+
Q = quantize(
412+
W,
413+
self.quantizer.scale,
414+
self.quantizer.zero,
415+
self.quantizer.maxq,
416+
)
417+
else:
418+
for i1 in range(0, self.columns, blocksize):
419+
i2 = min(i1 + blocksize, self.columns)
420+
count = i2 - i1
421+
422+
W1 = W[:, i1:i2].clone()
423+
Q1 = torch.zeros_like(W1)
424+
Err1 = torch.zeros_like(W1)
425+
Losses1 = torch.zeros_like(W1)
426+
Hinv1 = Hinv[i1:i2, i1:i2]
427+
428+
for i in range(count):
429+
w = W1[:, i]
430+
d = Hinv1[i, i]
431+
432+
if groupsize != -1:
433+
if not static_groups:
434+
if (i1 + i) % groupsize == 0:
435+
self.quantizer.find_params(
436+
W[:, (i1 + i) : (i1 + i + groupsize)], weight=True
437+
)
438+
else:
439+
idx: torch.Tensor | int = i1 + i
440+
if actorder:
441+
idx = perm[idx]
442+
self.quantizer = groups[idx // groupsize]
443+
444+
q = quantize(
445+
w.unsqueeze(1),
446+
self.quantizer.scale,
447+
self.quantizer.zero,
448+
self.quantizer.maxq,
449+
).flatten()
450+
Q1[:, i] = q
451+
Losses1[:, i] = (w - q) ** 2 / d**2
452+
453+
err1 = (w - q) / d
454+
W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
455+
Err1[:, i] = err1
456+
457+
Q[:, i1:i2] = Q1
458+
Losses[:, i1:i2] = Losses1 / 2
459+
460+
W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
359461

360462
if torch.cuda.is_available():
361463
torch.cuda.synchronize()
@@ -366,7 +468,11 @@ def fasterquant(
366468
if actorder:
367469
Q = Q[:, invperm]
368470

369-
if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
471+
if (
472+
isinstance(self.layer, nn.Conv2d)
473+
or isinstance(self.layer, nn.Conv1d)
474+
or isinstance(self.layer, nn.Conv3d)
475+
):
370476
if groupsize == -1: # TODO support groupsize != -1
371477
Q[:, dead] = quantize(
372478
self.layer.weight.flatten(1)[:, dead],

tico/quantization/algorithm/gptq/quantizer.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,10 @@ def forward(layer, *args, **kwargs):
110110
):
111111
self._first_layer_ref = model.model.layers[0]
112112
else:
113-
raise RuntimeError(
114-
"GPTQ Quantizer assumes the model has a nested structure like `model.model.layers`, commonly found in LLaMA and other Hugging Face transformer models."
115-
)
113+
self._first_layer_ref = model # let's treat it as a single layer
114+
# raise RuntimeError(
115+
# "GPTQ Quantizer assumes the model has a nested structure like `model.model.layers`, commonly found in LLaMA and other Hugging Face transformer models."
116+
# )
116117
else:
117118
# fallback if the model is not LLaMA-like; treat whole model as single layer
118119
self._first_layer_ref = model
@@ -180,7 +181,10 @@ def convert(self, model):
180181

181182
# Identify layers
182183
if hasattr(model, "model"):
183-
target_layers = model.model.layers
184+
if hasattr(model.model, "layers"):
185+
target_layers = model.model.layers
186+
else:
187+
target_layers = [model]
184188
else:
185189
target_layers = [model]
186190

@@ -204,6 +208,7 @@ def convert(self, model):
204208
torch.nn.Linear,
205209
torch.nn.Conv2d,
206210
torch.nn.Conv1d,
211+
torch.nn.Conv3d,
207212
torch.nn.ConvTranspose2d,
208213
],
209214
)
@@ -300,7 +305,8 @@ def _hook(_, inp, out):
300305
# This line ensures we always take the first element when it's a tuple.
301306
outs = outs[0] if isinstance(outs, tuple) else outs
302307
# Update inputs for next iteration.
303-
self.cache_args[0][batch_idx] = outs
308+
if len(self.cache_args) > 0:
309+
self.cache_args[0][batch_idx] = outs
304310

305311
if torch.cuda.is_available():
306312
torch.cuda.empty_cache()

0 commit comments

Comments
 (0)