Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions tico/serialize/circle_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
str_to_circle_dtype,
to_circle_dtype,
to_circle_shape,
to_flat_contiguous_numpy,
)
from tico.serialize.pack import pack_buffer
from tico.serialize.quant_param import QPARAM_KEY, QuantParam
Expand Down Expand Up @@ -160,7 +161,7 @@ def add_tensor_from_node(

buffer = circle.Buffer.BufferT()
if data is not None and isinstance(data, np.ndarray):
data = data.flatten()
data = to_flat_contiguous_numpy(data)

if QPARAM_KEY in node.meta:
if node.meta[QPARAM_KEY].dtype == "uint4":
Expand Down Expand Up @@ -190,7 +191,8 @@ def add_const_tensor(
tensor.shape, tensor.shapeSignature = to_circle_shape(torch_t_shape)

buffer = circle.Buffer.BufferT()
buffer.data = torch_t.flatten().cpu().numpy().view(np.uint8) # type: ignore[assignment]
flat_data = to_flat_contiguous_numpy(torch_t)
buffer.data = flat_data.view(np.uint8) # type: ignore[assignment]
bid = self.model.add_buffer(buffer)
tensor.buffer = bid
self._add_tensor(tensor)
Expand Down Expand Up @@ -278,8 +280,8 @@ def update_tensor_buffer(
assert op_tensor.shape == data_shape

buffer = circle.Buffer.BufferT()
# Packing np.ndarray is faster than packing bytes
buffer.data = data_tensor.flatten().cpu().numpy().view(np.uint8) # type: ignore[assignment]
flat_data = to_flat_contiguous_numpy(data_tensor)
buffer.data = flat_data.view(np.uint8) # type: ignore[assignment]
bid = self.model.add_buffer(buffer)
op_tensor.buffer = bid

Expand Down
38 changes: 38 additions & 0 deletions tico/serialize/circle_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,41 @@ def circle_legalize_dtype_to(values, *, dtype: torch.dtype) -> torch.Tensor:
if not check_if_i32_range(values):
raise RuntimeError("'size' cannot be converted from int64 to int32.")
return torch.as_tensor(values, dtype=dtype)


def to_flat_contiguous_numpy(data) -> np.ndarray:
    """Return *data* as a flat (1-D), C-contiguous NumPy array on the CPU.

    Accepted inputs are ``torch.Tensor`` or anything NumPy can turn into an
    array (``np.ndarray``, lists, tuples, scalars).

    For a ``torch.Tensor`` the conversion pipeline is:
    detach from autograd -> move to CPU (if needed) -> force a C-contiguous
    layout -> expose as NumPy -> flatten to shape ``(N,)``.

    For any other array-like, ``np.ascontiguousarray`` produces a C-ordered
    array which is then flattened.

    Copies are made only when required: ``Tensor.contiguous()`` and
    ``np.ascontiguousarray()`` are both no-ops on already-contiguous data,
    and flattening a contiguous array yields a view.

    Parameters
    ----------
    data : Union[torch.Tensor, np.ndarray, array-like]
        Input data to be converted.

    Returns
    -------
    np.ndarray
        A contiguous 1D NumPy array.
    """
    if not isinstance(data, torch.Tensor):
        # Generic array-like path: normalize layout first, then flatten.
        return np.ascontiguousarray(data).ravel()

    # Tensor path: get a CPU-resident, C-contiguous view before handing
    # the storage to NumPy; ravel() on contiguous memory is copy-free.
    host_tensor = data.detach().cpu().contiguous()
    return host_tensor.numpy().ravel()
36 changes: 25 additions & 11 deletions tico/serialize/pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,34 @@


def pack_buffer(flat_data: np.ndarray, dtype: str) -> np.ndarray:
    """Pack a flat uint8 array of sub-byte values into a compact byte array.

    Currently only ``dtype == "uint4"`` is supported: each pair of input
    elements (values in [0, 15]) is packed into one output byte, the even
    index in the lower nibble and the odd index in the upper nibble. For an
    odd-length input the last byte holds only a lower nibble.

    Parameters
    ----------
    flat_data : np.ndarray
        One-dimensional array of values to pack. Must be ``np.uint8`` for
        ``"uint4"`` packing.
    dtype : str
        Logical element type of the data; only ``"uint4"`` is implemented.

    Returns
    -------
    np.ndarray
        Packed ``np.uint8`` array of length ``ceil(len(flat_data) / 2)``.

    Raises
    ------
    RuntimeError
        If the input dtype is not uint8, or a value is outside [0, 15].
    NotImplementedError
        For any ``dtype`` other than ``"uint4"``.
    """
    assert flat_data.ndim == 1

    if dtype == "uint4":
        if flat_data.dtype != np.uint8:
            raise RuntimeError("uint4 data should be saved in uint8.")

        if flat_data.size == 0:
            return np.empty(0, dtype=np.uint8)

        if np.any(flat_data > 15):
            raise RuntimeError("uint4 data must be in [0, 15].")

        # Vectorized packing: flat_data[0::2] / flat_data[1::2] are strided
        # views (no copy), and the shift/or run in optimized C — much faster
        # than a Python-level element loop.
        packed = np.empty((flat_data.size + 1) // 2, dtype=np.uint8)
        packed[:] = flat_data[0::2]  # lower nibbles (covers a trailing odd element)
        # For odd-sized inputs the last packed byte has no upper nibble, so
        # the |= is restricted to the first size // 2 elements to keep the
        # shapes aligned. uint8 << 4 stays uint8; astype is a defensive no-op.
        upper = (flat_data[1::2] << 4).astype(np.uint8, copy=False)
        packed[: flat_data.size // 2] |= upper
        return packed

    raise NotImplementedError(f"NYI dtype: {dtype}")
Loading