Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions tico/serialize/circle_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
str_to_circle_dtype,
to_circle_dtype,
to_circle_shape,
to_flat_contiguous_numpy,
)
from tico.serialize.pack import pack_buffer
from tico.serialize.quant_param import QPARAM_KEY, QuantParam
Expand Down Expand Up @@ -160,7 +161,7 @@ def add_tensor_from_node(

buffer = circle.Buffer.BufferT()
if data is not None and isinstance(data, np.ndarray):
data = data.flatten()
data = to_flat_contiguous_numpy(data)

if QPARAM_KEY in node.meta:
if node.meta[QPARAM_KEY].dtype == "uint4":
Expand Down Expand Up @@ -190,7 +191,8 @@ def add_const_tensor(
tensor.shape, tensor.shapeSignature = to_circle_shape(torch_t_shape)

buffer = circle.Buffer.BufferT()
buffer.data = torch_t.flatten().cpu().numpy().view(np.uint8) # type: ignore[assignment]
flat_data = to_flat_contiguous_numpy(torch_t)
buffer.data = flat_data.view(np.uint8) # type: ignore[assignment]
bid = self.model.add_buffer(buffer)
tensor.buffer = bid
self._add_tensor(tensor)
Expand Down Expand Up @@ -278,8 +280,8 @@ def update_tensor_buffer(
assert op_tensor.shape == data_shape

buffer = circle.Buffer.BufferT()
# Packing np.ndarray is faster than packing bytes
buffer.data = data_tensor.flatten().cpu().numpy().view(np.uint8) # type: ignore[assignment]
flat_data = to_flat_contiguous_numpy(data_tensor)
buffer.data = flat_data.view(np.uint8) # type: ignore[assignment]
bid = self.model.add_buffer(buffer)
op_tensor.buffer = bid

Expand Down
38 changes: 38 additions & 0 deletions tico/serialize/circle_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,41 @@ def circle_legalize_dtype_to(values, *, dtype: torch.dtype) -> torch.Tensor:
if not check_if_i32_range(values):
raise RuntimeError("'size' cannot be converted from int64 to int32.")
return torch.as_tensor(values, dtype=dtype)


def to_flat_contiguous_numpy(data) -> np.ndarray:
    """Return *data* as a flat (1-D), C-contiguous NumPy array on the CPU.

    Accepted inputs are ``torch.Tensor`` or anything NumPy can turn into an
    array (``np.ndarray``, lists, tuples, scalars).

    For a ``torch.Tensor`` the conversion pipeline is:
    detach from autograd -> move to CPU (if needed) -> force a C-contiguous
    layout -> expose as NumPy -> flatten to shape ``(N,)``.

    For any other array-like, ``np.ascontiguousarray`` produces a C-ordered
    array which is then flattened.

    Copies are made only when required: ``Tensor.contiguous()`` and
    ``np.ascontiguousarray()`` are both no-ops on already-contiguous data,
    and flattening a contiguous array yields a view.

    Parameters
    ----------
    data : Union[torch.Tensor, np.ndarray, array-like]
        Input data to be converted.

    Returns
    -------
    np.ndarray
        A contiguous 1D NumPy array.
    """
    if not isinstance(data, torch.Tensor):
        # Generic array-like path: normalize layout first, then flatten.
        return np.ascontiguousarray(data).ravel()

    # Tensor path: get a CPU-resident, C-contiguous view before handing
    # the storage to NumPy; ravel() on contiguous memory is copy-free.
    host_tensor = data.detach().cpu().contiguous()
    return host_tensor.numpy().ravel()
36 changes: 25 additions & 11 deletions tico/serialize/pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,34 @@


def pack_buffer(flat_data: np.ndarray, dtype: str) -> np.ndarray:
    """Pack a flat uint8 array of sub-byte values into a compact byte array.

    Currently only ``dtype == "uint4"`` is supported: each pair of input
    elements (values in [0, 15]) is packed into one output byte, the even
    index in the lower nibble and the odd index in the upper nibble. For an
    odd-length input the last byte holds only a lower nibble.

    Parameters
    ----------
    flat_data : np.ndarray
        One-dimensional array of values to pack. Must be ``np.uint8`` for
        ``"uint4"`` packing.
    dtype : str
        Logical element type of the data; only ``"uint4"`` is implemented.

    Returns
    -------
    np.ndarray
        Packed ``np.uint8`` array of length ``ceil(len(flat_data) / 2)``.

    Raises
    ------
    RuntimeError
        If the input dtype is not uint8, or a value is outside [0, 15].
    NotImplementedError
        For any ``dtype`` other than ``"uint4"``.
    """
    assert flat_data.ndim == 1

    if dtype == "uint4":
        if flat_data.dtype != np.uint8:
            raise RuntimeError("uint4 data should be saved in uint8.")

        if flat_data.size == 0:
            return np.empty(0, dtype=np.uint8)

        if np.any(flat_data > 15):
            raise RuntimeError("uint4 data must be in [0, 15].")

        # Vectorized packing: flat_data[0::2] / flat_data[1::2] are strided
        # views (no copy), and the shift/or run in optimized C — much faster
        # than a Python-level element loop.
        packed = np.empty((flat_data.size + 1) // 2, dtype=np.uint8)
        packed[:] = flat_data[0::2]  # lower nibbles (covers a trailing odd element)
        # For odd-sized inputs the last packed byte has no upper nibble, so
        # the |= is restricted to the first size // 2 elements to keep the
        # shapes aligned. uint8 << 4 stays uint8; astype is a defensive no-op.
        upper = (flat_data[1::2] << 4).astype(np.uint8, copy=False)
        packed[: flat_data.size // 2] |= upper
        return packed

    raise NotImplementedError(f"NYI dtype: {dtype}")
Loading