Samsung · glistening · Nov 21, 2025 · Nov 21, 2025
diff --git a/tools/oquantize/MANIFEST.in b/tools/oquantize/MANIFEST.in
@@ -0,0 +1 @@
+include oquantize/lib/*.so
diff --git a/tools/oquantize/README.md b/tools/oquantize/README.md
@@ -0,0 +1,46 @@
+# Circle Model Quantization with GGML
+
+This tool quantizes Circle models using the GGML library.
+
+## Prerequisites
+- `gcc` installed
+- `flatc` (FlatBuffers compiler) must be available
+- Set `FLATC_PATH` if `flatc` is not in your PATH or standard build locations
+
+## Building the Tool
+
+The tool is structured as a Python package `oquantize` located in `tools/oquantize`.
+It includes a C shared library (loaded via `ctypes`) that must be compiled, and the build step also generates `circle.py` from the Circle schema.
+
+```bash
+cd tools/oquantize
+python3 setup.py
+```
+
+This compiles `libggml_quant.so` from the GGML source files and generates `circle.py`.
+
+## Running the Tool
+
+To quantize a Circle model, run the `oquantize` package from the `tools` directory:
+
+```bash
+cd tools
+# Usage: python -m oquantize <quant_type> <input_circle> <output_circle>
+python3 -m oquantize q4_0 prefill.circle prefill.q4.circle
+python3 -m oquantize q4_0 decode.circle decode.q4.circle
+```
+
+### File Size Comparison
+
+| File | Original Size | Quantized Size | Reduction |
+|------|---------------|----------------|-----------|
+| prefill.circle | 18M | 2.7M | ~85% |
+| decode.circle | 18M | 2.7M | ~85% |
+
+(Note: the large reduction comes from FP32 -> Q4_0 quantization, which stores every 32 weights in an 18-byte block instead of 128 bytes.)
+
+## Implementation Details
+- **Package Structure**: `tools/oquantize/`
+- **C Extension**: `libggml_quant.so` compiled from `ggml-quants.c`, `ggml-aarch64.c`, and `ggml.c`
+- **Quantization**: Row-wise `GGML_Q4_0` quantization for `GATHER` (input 0) and `FULLY_CONNECTED` (input 1) weights
+- **Schema**: `circle.py` generated from `runtime/libs/circle-schema/circle_schema.fbs` using `flatc --python --gen-object-api --gen-onefile`
diff --git a/tools/oquantize/__init__.py b/tools/oquantize/__init__.py
diff --git a/tools/oquantize/__main__.py b/tools/oquantize/__main__.py
@@ -0,0 +1,4 @@
+from .main import main
+
+if __name__ == "__main__":  # true when the package is started via `python -m oquantize`
+    main()
diff --git a/tools/oquantize/main.py b/tools/oquantize/main.py
@@ -0,0 +1,165 @@
+import ctypes
+import os
+import sys
+import numpy as np
+import flatbuffers
+
+# Put tools/o2o (sibling of this package) on sys.path so `import circle` can find it there
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../o2o')))
+try:
+    import circle
+except ImportError:
+    print(
+        "Error: Could not import 'circle'. Make sure tools/o2o is in PYTHONPATH or the script is run from the correct location."
+    )
+    sys.exit(1)  # every later step in this module uses `circle`
+
+
+def load_ggml_library():
+    """Load the bundled libggml_quant.so and declare quantize_row_q4_0's signature.
+
+    Prints an error and exits with status 1 if the library file does not exist.
+    """
+    lib_path = os.path.join(os.path.dirname(__file__), 'lib', 'libggml_quant.so')
+    if not os.path.exists(lib_path):
+        print(f"Error: {lib_path} not found. Please build the package first.")
+        sys.exit(1)
+    lib = ctypes.CDLL(lib_path)
+    # C prototype: void quantize_row_q4_0(const float * x, void * y, int64_t k);
+    row_fn = lib.quantize_row_q4_0
+    row_fn.argtypes = [ctypes.POINTER(ctypes.c_float), ctypes.c_void_p, ctypes.c_int64]
+    row_fn.restype = None
+    return lib
+
+
+def quantize_tensor(lib, tensor_data):
+    """Quantize a flat float32 tensor to GGML Q4_0 and return the packed bytes.
+
+    Returns None (after printing a warning) when the element count is not a
+    multiple of the 32-element Q4_0 block. The input is coerced to a C-contiguous
+    float32 array first, because the C routine reads it through a raw float pointer.
+    """
+    qk4_0 = 32  # elements per Q4_0 block
+    block_bytes = 18  # sizeof(ggml_half) + QK4_0 / 2 = 2 + 16
+    values = np.ascontiguousarray(tensor_data, dtype=np.float32)
+    k = values.size
+
+    if k % qk4_0 != 0:
+        print(f"Warning: Tensor size {k} is not a multiple of 32. Skipping quantization.")
+        return None
+
+    output_buffer = (ctypes.c_byte * (k // qk4_0 * block_bytes))()
+    input_ptr = values.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+
+    # quantize_row_q4_0 processes the whole buffer (k elements) as a single row,
+    # so Q4_0 blocks are formed over the flattened data.
+    lib.quantize_row_q4_0(input_ptr, output_buffer, ctypes.c_int64(k))
+
+    return bytearray(output_buffer)
+
+
+def _weight_tensor_index(model_t, op):
+    """Return the tensor index of the weights consumed by `op`, or -1 if none applies."""
+    if op.inputs is None or op.opcodeIndex >= len(model_t.operatorCodes):
+        return -1
+    builtin_code = model_t.operatorCodes[op.opcodeIndex].builtinCode
+    if builtin_code == circle.BuiltinOperator.GATHER:
+        slot = 0  # GATHER: input 0 is params (weights)
+    elif builtin_code == circle.BuiltinOperator.FULLY_CONNECTED:
+        slot = 1  # FULLY_CONNECTED: input 1 is weights
+    else:
+        return -1
+    return op.inputs[slot] if len(op.inputs) > slot else -1
+
+
+def _quantize_weight(lib, model_t, tensor, tensor_idx, done_buffers):
+    """Convert one FLOAT32 weight tensor to GGML_Q4_0 in place; return True on success.
+
+    `done_buffers` collects the buffer indices already converted during this run.
+    """
+    if tensor.type != circle.TensorType.FLOAT32:
+        return False
+    buffer_idx = tensor.buffer
+    if buffer_idx in done_buffers:
+        # A tensor sharing this buffer already converted it; quantizing again would
+        # reinterpret Q4_0 bytes as float32, so only the type tag is updated.
+        tensor.type = circle.TensorType.GGML_Q4_0
+        return True
+    if buffer_idx >= len(model_t.buffers) or model_t.buffers[buffer_idx].data is None:
+        return False
+    shape = tensor.shape
+    if shape is not None and len(shape) > 0 and int(shape[-1]) % 32 != 0:
+        # Q4_0 packs 32 consecutive values per block, so a block must not span two rows.
+        print(f"Warning: Tensor {tensor_idx} row length {int(shape[-1])} is not a "
+              "multiple of 32. Skipping quantization.")
+        return False
+    buffer_obj = model_t.buffers[buffer_idx]
+    tensor_data = np.frombuffer(bytes(buffer_obj.data), dtype=np.float32)
+    print(f"Quantizing tensor {tensor_idx} (size={tensor_data.size})...")
+    quantized_data = quantize_tensor(lib, tensor_data)
+    if quantized_data is None:
+        return False
+    # Pack() serializes an np.ndarray through CreateNumpyVector, so store uint8 directly.
+    buffer_obj.data = np.frombuffer(quantized_data, dtype=np.uint8)
+    tensor.type = circle.TensorType.GGML_Q4_0
+    done_buffers.add(buffer_idx)
+    return True
+
+
+def main():
+    """Command-line entry point: quantize a Circle model's weights to GGML Q4_0.
+
+    Reads `<quant_type> <input_circle> <output_circle>` from sys.argv and exits with
+    status 1 on invalid arguments. Only FLOAT32 weights of GATHER (input 0) and
+    FULLY_CONNECTED (input 1) are converted; no file is written if none qualifies.
+    """
+    if len(sys.argv) != 4:
+        print("Usage: python -m oquantize <quant_type> <input_circle> <output_circle>")
+        print("Supported quant_type: q4_0")
+        sys.exit(1)
+
+    quant_type, input_path, output_path = sys.argv[1:4]
+    if quant_type != "q4_0":
+        print(
+            f"Error: Unsupported quantization type '{quant_type}'. Only 'q4_0' is supported."
+        )
+        sys.exit(1)
+
+    if not os.path.exists(input_path):
+        print(f"Error: Input file {input_path} does not exist.")
+        sys.exit(1)
+
+    lib = load_ggml_library()
+    print(f"Loading {input_path}...")
+    with open(input_path, 'rb') as f:
+        buf = f.read()
+    model_t = circle.ModelT.InitFromObj(circle.Model.GetRootAs(buf, 0))
+
+    quantized_count = 0
+    done = set()
+    for subgraph in model_t.subgraphs or []:
+        tensors = subgraph.tensors or []
+        for op in subgraph.operators or []:
+            tensor_idx = _weight_tensor_index(model_t, op)
+            # -1 means "no weight input"; indices outside the tensor table are skipped too.
+            if not 0 <= tensor_idx < len(tensors):
+                continue
+            if _quantize_weight(lib, model_t, tensors[tensor_idx], tensor_idx, done):
+                quantized_count += 1
+
+    if quantized_count == 0:
+        print("No tensors quantized.")
+        return
+
+    print(f"Quantized {quantized_count} tensors.")
+    print(f"Saving to {output_path}...")
+
+    builder = flatbuffers.Builder(1024)
+    builder.Finish(model_t.Pack(builder), file_identifier=b'CIR0')
+    with open(output_path, 'wb') as f:
+        f.write(builder.Output())
+    print("Done.")
+
+
+if __name__ == "__main__":  # direct-execution entry point (python main.py ...)
+    main()
diff --git a/tools/oquantize/setup.py b/tools/oquantize/setup.py
@@ -0,0 +1,128 @@
+import os
+import subprocess
+import sys
+from setuptools import setup, find_packages
+from setuptools.command.build_py import build_py
+
+import shutil
+
+
+def find_flatc():
+    """Locate the flatc executable, or return None if it cannot be found.
+
+    Lookup order: FLATC_PATH environment variable, the system PATH, then the
+    release/debug overlay directories of the repository build tree.
+    """
+    def is_executable(candidate):
+        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)
+
+    # 1. Explicit override through the environment
+    from_env = os.environ.get('FLATC_PATH')
+    if from_env and is_executable(from_env):
+        return from_env
+    # 2. Whatever the system PATH resolves, otherwise 3. the known build outputs
+    on_path = shutil.which('flatc')
+    if on_path:
+        return on_path
+    here = os.path.dirname(os.path.abspath(__file__))
+    for flavor in ('release', 'debug'):
+        built = os.path.join(here, f'../../build/{flavor}/overlay/bin/flatc')
+        if is_executable(built):
+            return built
+    return None
+
+
+def generate_circle_py():
+    """Generate circle.py next to this script from the Circle FlatBuffers schema.
+
+    Prints an error and exits with status 1 if flatc or the schema cannot be found,
+    or if running flatc or renaming its output fails.
+    """
+    flatc_path = find_flatc()
+    if not flatc_path:
+        print(
+            "Error: flatc not found. Please set FLATC_PATH environment variable or ensure flatc is in your PATH or build directory."
+        )
+        sys.exit(1)
+
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    schema_path = os.path.abspath(
+        os.path.join(script_dir, '../../runtime/libs/circle-schema/circle_schema.fbs'))
+    if not os.path.exists(schema_path):
+        print(f"Error: Schema file not found at {schema_path}")
+        sys.exit(1)
+    print(f"Generating circle.py using {flatc_path} from {schema_path}...")
+    cmd = [
+        flatc_path, '--python', '--gen-object-api', '--gen-onefile', '-o', script_dir,
+        schema_path
+    ]
+    try:
+        subprocess.run(cmd, check=True)
+        # os.replace overwrites an existing circle.py on every platform
+        # (os.rename refuses to do so on Windows).
+        os.replace(os.path.join(script_dir, 'circle_schema_generated.py'),
+                   os.path.join(script_dir, 'circle.py'))
+    except (subprocess.CalledProcessError, OSError) as e:  # let Ctrl-C propagate
+        print(f"Failed to generate circle.py: {e}")
+        sys.exit(1)
+
+
+def compile_ggml_lib():
+    """Compile lib/libggml_quant.so from the bundled GGML sources using gcc.
+
+    Prints an error and exits with status 1 if gcc fails or cannot be started.
+    """
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    ggml_src_dir = os.path.abspath(
+        os.path.join(script_dir, '../../runtime/3rdparty/ggml/src'))
+    lib_name = 'libggml_quant.so'
+    lib_dir = os.path.join(script_dir, 'lib')
+    lib_path = os.path.join(lib_dir, lib_name)
+    os.makedirs(lib_dir, exist_ok=True)
+
+    print(f"Compiling {lib_name} from {ggml_src_dir}...")
+    sources = [
+        os.path.join(ggml_src_dir, name)
+        for name in ('ggml-quants.c', 'ggml-aarch64.c', 'ggml.c')
+    ]
+    include_dir = os.path.abspath(os.path.join(ggml_src_dir, '../include'))
+    cmd = ['gcc', '-shared', '-fPIC', '-O3', '-o', lib_path, *sources,
+           '-I', ggml_src_dir, '-I', include_dir, '-lm']
+    print("Running command:", " ".join(cmd))
+    try:
+        subprocess.check_call(cmd)
+    except (subprocess.CalledProcessError, OSError) as e:  # OSError: gcc not installed
+        print(f"Failed to compile {lib_name}: {e}")
+        sys.exit(1)
+    print(f"Successfully compiled {lib_path}")
+
+
+class CustomBuildPy(build_py):  # build_py hook registered through cmdclass in setup()
+    def run(self):
+        generate_circle_py()  # regenerate circle.py from the schema
+        compile_ggml_lib()  # build lib/libggml_quant.so
+        super().run()  # then continue with the stock build_py step
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        # No command given (`python3 setup.py`): bypass setuptools and only
+        # generate circle.py and compile the shared library in place.
+        generate_circle_py()
+        compile_ggml_lib()
+    else:
+        # Any setuptools command: regular packaging; build_py runs the same two steps.
+        setup(
+            name='oquantize',
+            version='0.1.0',
+            packages=['oquantize'],
+            package_dir={'oquantize': '.'},
+            include_package_data=True,
+            cmdclass={
+                'build_py': CustomBuildPy,
+            },
+            install_requires=[
+                'numpy',
+                'flatbuffers',
+            ],
+        )