From ef648d9b82267c59567b92cc897aaf1977f12537 Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Fri, 21 Nov 2025 10:20:38 +0900 Subject: [PATCH 1/2] [tools/oquantize] Introduce oquantize oquantize (where o means circle) aims to quantize weights in circle. It supports ggml q4_0. ONE-DCO-1.0-Signed-off-by: Sanggyu Lee --- tools/oquantize/MANIFEST.in | 1 + tools/oquantize/README.md | 46 +++++++++++ tools/oquantize/__init__.py | 0 tools/oquantize/__main__.py | 4 + tools/oquantize/main.py | 154 ++++++++++++++++++++++++++++++++++++ tools/oquantize/setup.py | 129 ++++++++++++++++++++++++++++++ 6 files changed, 334 insertions(+) create mode 100644 tools/oquantize/MANIFEST.in create mode 100644 tools/oquantize/README.md create mode 100644 tools/oquantize/__init__.py create mode 100644 tools/oquantize/__main__.py create mode 100644 tools/oquantize/main.py create mode 100644 tools/oquantize/setup.py diff --git a/tools/oquantize/MANIFEST.in b/tools/oquantize/MANIFEST.in new file mode 100644 index 00000000000..1d39f83b4f3 --- /dev/null +++ b/tools/oquantize/MANIFEST.in @@ -0,0 +1 @@ +include oquantize/lib/*.so diff --git a/tools/oquantize/README.md b/tools/oquantize/README.md new file mode 100644 index 00000000000..419675cca52 --- /dev/null +++ b/tools/oquantize/README.md @@ -0,0 +1,46 @@ +# Circle Model Quantization with GGML + +This tool quantizes Circle models using the GGML library. + +## Prerequisites +- `gcc` installed +- `flatc` (FlatBuffers compiler) must be available +- Set `FLATC_PATH` if `flatc` is not in your PATH or standard build locations + +## Building the Tool + +The tool is structured as a Python package `oquantize` located in `tools/oquantize`. +It includes a C extension that needs to be compiled and generates `circle.py` from schema. + +```bash +cd tools/oquantize +python3 setup.py +``` + +This compiles `libggml_quant.so` from the GGML source files and generates `circle.py`. + +## Running the Tool + +To quantize a Circle model, run the `oquantize` package from the `tools` directory: + +```bash +cd tools +# Usage: python -m oquantize +python3 -m oquantize q4_0 prefill.circle prefill.q4.circle +python3 -m oquantize q4_0 decode.circle decode.q4.circle +``` + +### File Size Comparison + +| File | Original Size | Quantized Size | Reduction | +|------|---------------|----------------|-----------| +| prefill.circle | 18M | 2.7M | ~85% | +| decode.circle | 18M | 2.7M | ~85% | + +(Note: significant reduction is observed due to FP32 -> Q4_0 quantization). + +## Implementation Details +- **Package Structure**: `tools/oquantize/` +- **C Extension**: `libggml_quant.so` compiled from `ggml-quants.c`, `ggml-aarch64.c`, and `ggml.c` +- **Quantization**: Row-wise `GGML_Q4_0` quantization for `GATHER` (input 0) and `FULLY_CONNECTED` (input 1) weights +- **Schema**: `circle.py` generated from `runtime/libs/circle-schema/circle_schema.fbs` using `flatc --python --gen-object-api --gen-onefile` diff --git a/tools/oquantize/__init__.py b/tools/oquantize/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tools/oquantize/__main__.py b/tools/oquantize/__main__.py new file mode 100644 index 00000000000..40e2b013f61 --- /dev/null +++ b/tools/oquantize/__main__.py @@ -0,0 +1,4 @@ +from .main import main + +if __name__ == "__main__": + main() diff --git a/tools/oquantize/main.py b/tools/oquantize/main.py new file mode 100644 index 00000000000..2f936049b7f --- /dev/null +++ b/tools/oquantize/main.py @@ -0,0 +1,154 @@ +import ctypes +import os +import sys +import numpy as np +import flatbuffers + +# Add tools/o2o to sys.path to import circle +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../o2o'))) +try: + import circle +except ImportError: + print("Error: Could not import 'circle'. Make sure tools/o2o is in PYTHONPATH or the script is run from the correct location.") + sys.exit(1) + +def load_ggml_library(): + lib_path = os.path.join(os.path.dirname(__file__), 'lib', 'libggml_quant.so') + if not os.path.exists(lib_path): + print(f"Error: {lib_path} not found. Please build the package first.") + sys.exit(1) + + lib = ctypes.CDLL(lib_path) + + # void quantize_row_q4_0(const float * x, void * y, int64_t k); + lib.quantize_row_q4_0.argtypes = [ + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int64 + ] + lib.quantize_row_q4_0.restype = None + + return lib + +def quantize_tensor(lib, tensor_data): + # tensor_data is a numpy array of float32 + k = tensor_data.size + + if k % 32 != 0: + print(f"Warning: Tensor size {k} is not a multiple of 32. Skipping quantization.") + return None + + # QK4_0 = 32 + # block_q4_0 size = sizeof(ggml_half) + QK4_0 / 2 = 2 + 16 = 18 bytes + block_size = 18 + num_blocks = k // 32 + output_size = num_blocks * block_size + + output_buffer = (ctypes.c_byte * output_size)() + + # Create a pointer to the input data + input_ptr = tensor_data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + + # Call the C function + # quantize_row_q4_0 processes the whole row (k elements) + lib.quantize_row_q4_0(input_ptr, output_buffer, ctypes.c_int64(k)) + + return bytearray(output_buffer) + +def main(): + if len(sys.argv) != 4: + print("Usage: python -m oquantize ") + print("Supported quant_type: q4_0") + sys.exit(1) + + quant_type = sys.argv[1] + input_path = sys.argv[2] + output_path = sys.argv[3] + + if quant_type != "q4_0": + print(f"Error: Unsupported quantization type '{quant_type}'. Only 'q4_0' is supported.") + sys.exit(1) + + if not os.path.exists(input_path): + print(f"Error: Input file {input_path} does not exist.") + sys.exit(1) + + lib = load_ggml_library() + + print(f"Loading {input_path}...") + with open(input_path, 'rb') as f: + buf = f.read() + + model = circle.Model.GetRootAs(buf, 0) + model_t = circle.ModelT.InitFromObj(model) + + quantized_count = 0 + + for subgraph in model_t.subgraphs: + for op in subgraph.operators: + target_tensor_idx = -1 + + if op.opcodeIndex < len(model_t.operatorCodes): + op_code = model_t.operatorCodes[op.opcodeIndex] + builtin_code = op_code.builtinCode + + if builtin_code == circle.BuiltinOperator.GATHER: + # GATHER: input 0 is params (weights) + if len(op.inputs) > 0: + target_tensor_idx = op.inputs[0] + elif builtin_code == circle.BuiltinOperator.FULLY_CONNECTED: + # FULLY_CONNECTED: input 1 is weights + if len(op.inputs) > 1: + target_tensor_idx = op.inputs[1] + + if target_tensor_idx != -1: + tensor = subgraph.tensors[target_tensor_idx] + + if tensor.type == circle.TensorType.FLOAT32: + buffer_idx = tensor.buffer + if buffer_idx < len(model_t.buffers): + buffer_obj = model_t.buffers[buffer_idx] + + # Check if buffer has data + if buffer_obj.data is not None: + # Convert to numpy array + # buffer_obj.data is a list of ints (bytes) or numpy array + # circle.py generated code usually behaves like this: + # if InitFromObj used numpy, it might be numpy. + # Let's assume it's a list of uint8 or similar. + + data_bytes = bytes(buffer_obj.data) + tensor_data = np.frombuffer(data_bytes, dtype=np.float32) + + print(f"Quantizing tensor {target_tensor_idx} (size={tensor_data.size})...") + + quantized_data = quantize_tensor(lib, tensor_data) + + if quantized_data is not None: + # Update buffer + buffer_obj.data = list(quantized_data) # FlatBuffers python expects list of ints for ubyte vector? + # Or numpy array? circle.py: + # if np is not None and type(self.data) is np.ndarray: builder.CreateNumpyVector(self.data) + # So we can set it to numpy array of uint8 + buffer_obj.data = np.frombuffer(quantized_data, dtype=np.uint8) + + # Update tensor type + tensor.type = circle.TensorType.GGML_Q4_0 + quantized_count += 1 + + if quantized_count > 0: + print(f"Quantized {quantized_count} tensors.") + print(f"Saving to {output_path}...") + + builder = flatbuffers.Builder(1024) + model_offset = model_t.Pack(builder) + builder.Finish(model_offset, file_identifier=b'CIR0') + + with open(output_path, 'wb') as f: + f.write(builder.Output()) + print("Done.") + else: + print("No tensors quantized.") + +if __name__ == "__main__": + main() diff --git a/tools/oquantize/setup.py b/tools/oquantize/setup.py new file mode 100644 index 00000000000..f8e4691482e --- /dev/null +++ b/tools/oquantize/setup.py @@ -0,0 +1,129 @@ +import os +import subprocess +import sys +from setuptools import setup, find_packages +from setuptools.command.build_py import build_py + +import shutil + +def find_flatc(): + # 1. Check FLATC_PATH environment variable + flatc_env = os.environ.get('FLATC_PATH') + if flatc_env and os.path.isfile(flatc_env) and os.access(flatc_env, os.X_OK): + return flatc_env + + # 2. Check system PATH + flatc_path = shutil.which('flatc') + if flatc_path: + return flatc_path + + # 3. Check common build locations + script_dir = os.path.dirname(os.path.abspath(__file__)) + possible_paths = [ + os.path.join(script_dir, '../../build/release/overlay/bin/flatc'), + os.path.join(script_dir, '../../build/debug/overlay/bin/flatc'), + ] + + for path in possible_paths: + if os.path.isfile(path) and os.access(path, os.X_OK): + return path + + return None + +def generate_circle_py(): + flatc_path = find_flatc() + if not flatc_path: + print("Error: flatc not found. Please set FLATC_PATH environment variable or ensure flatc is in your PATH or build directory.") + sys.exit(1) + + script_dir = os.path.dirname(os.path.abspath(__file__)) + schema_path = os.path.abspath(os.path.join(script_dir, '../../runtime/libs/circle-schema/circle_schema.fbs')) + + if not os.path.exists(schema_path): + print(f"Error: Schema file not found at {schema_path}") + sys.exit(1) + + output_dir = script_dir + + print(f"Generating circle.py using {flatc_path} from {schema_path}...") + cmd = [ + flatc_path, + '--python', + '--gen-object-api', + '--gen-onefile', + '-o', output_dir, + schema_path + ] + + try: + subprocess.run(cmd, check=True) + generated_file = os.path.join(output_dir, 'circle_schema_generated.py') + target_file = os.path.join(output_dir, 'circle.py') + os.rename(generated_file, target_file) + print("Successfully generated circle.py") + except: + print(f"Failed to generate circle.py") + sys.exit(1) + +def compile_ggml_lib(): + script_dir = os.path.dirname(os.path.abspath(__file__)) + ggml_src_dir = os.path.abspath(os.path.join(script_dir, '../../runtime/3rdparty/ggml/src')) + lib_dir = os.path.join(script_dir, 'lib') + lib_name = 'libggml_quant.so' + lib_path = os.path.join(lib_dir, lib_name) + + if not os.path.exists(lib_dir): + os.makedirs(lib_dir) + + print(f"Compiling {lib_name} from {ggml_src_dir}...") + + cmd = [ + 'gcc', + '-shared', + '-fPIC', + '-O3', + '-o', lib_path, + os.path.join(ggml_src_dir, 'ggml-quants.c'), + os.path.join(ggml_src_dir, 'ggml-aarch64.c'), + os.path.join(ggml_src_dir, 'ggml.c'), + '-I', ggml_src_dir, + '-I', os.path.abspath(os.path.join(ggml_src_dir, '../include')), + '-lm' + ] + + print("Running command:", " ".join(cmd)) + try: + subprocess.check_call(cmd) + print(f"Successfully compiled {lib_path}") + except subprocess.CalledProcessError as e: + print(f"Failed to compile {lib_name}: {e}") + sys.exit(1) + +class CustomBuildPy(build_py): + def run(self): + generate_circle_py() + compile_ggml_lib() + super().run() + +if __name__ == "__main__": + if len(sys.argv) == 1: + # Direct execution: python setup.py + # Compile in-place + generate_circle_py() + compile_ggml_lib() + else: + # Standard setuptools execution + setup( + name='oquantize', + version='0.1.0', + packages=['oquantize'], + package_dir={'oquantize': '.'}, + include_package_data=True, + cmdclass={ + 'build_py': CustomBuildPy, + }, + install_requires=[ + 'numpy', + 'flatbuffers', + ], + ) From 7f2ed2adfe1b1cd69bfe1c79b34c8e53e63dd832 Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Fri, 21 Nov 2025 11:10:59 +0900 Subject: [PATCH 2/2] Update coding style --- tools/oquantize/main.py | 27 +++++++++++++++++++-------- tools/oquantize/setup.py | 33 ++++++++++++++++----------------- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/tools/oquantize/main.py b/tools/oquantize/main.py index 2f936049b7f..5744bb5df0f 100644 --- a/tools/oquantize/main.py +++ b/tools/oquantize/main.py @@ -9,9 +9,12 @@ try: import circle except ImportError: - print("Error: Could not import 'circle'. Make sure tools/o2o is in PYTHONPATH or the script is run from the correct location.") + print( + "Error: Could not import 'circle'. Make sure tools/o2o is in PYTHONPATH or the script is run from the correct location." + ) sys.exit(1) + def load_ggml_library(): lib_path = os.path.join(os.path.dirname(__file__), 'lib', 'libggml_quant.so') if not os.path.exists(lib_path): @@ -22,14 +25,13 @@ def load_ggml_library(): # void quantize_row_q4_0(const float * x, void * y, int64_t k); lib.quantize_row_q4_0.argtypes = [ - ctypes.POINTER(ctypes.c_float), - ctypes.c_void_p, - ctypes.c_int64 + ctypes.POINTER(ctypes.c_float), ctypes.c_void_p, ctypes.c_int64 ] lib.quantize_row_q4_0.restype = None return lib + def quantize_tensor(lib, tensor_data): # tensor_data is a numpy array of float32 k = tensor_data.size @@ -55,6 +57,7 @@ def quantize_tensor(lib, tensor_data): return bytearray(output_buffer) + def main(): if len(sys.argv) != 4: print("Usage: python -m oquantize ") @@ -66,7 +69,9 @@ def main(): output_path = sys.argv[3] if quant_type != "q4_0": - print(f"Error: Unsupported quantization type '{quant_type}'. Only 'q4_0' is supported.") + print( + f"Error: Unsupported quantization type '{quant_type}'. Only 'q4_0' is supported." + ) sys.exit(1) if not os.path.exists(input_path): @@ -120,17 +125,22 @@ def main(): data_bytes = bytes(buffer_obj.data) tensor_data = np.frombuffer(data_bytes, dtype=np.float32) - print(f"Quantizing tensor {target_tensor_idx} (size={tensor_data.size})...") + print( + f"Quantizing tensor {target_tensor_idx} (size={tensor_data.size})..." + ) quantized_data = quantize_tensor(lib, tensor_data) if quantized_data is not None: # Update buffer - buffer_obj.data = list(quantized_data) # FlatBuffers python expects list of ints for ubyte vector? + buffer_obj.data = list( + quantized_data + ) # FlatBuffers python expects list of ints for ubyte vector? # Or numpy array? circle.py: # if np is not None and type(self.data) is np.ndarray: builder.CreateNumpyVector(self.data) # So we can set it to numpy array of uint8 - buffer_obj.data = np.frombuffer(quantized_data, dtype=np.uint8) + buffer_obj.data = np.frombuffer(quantized_data, + dtype=np.uint8) # Update tensor type tensor.type = circle.TensorType.GGML_Q4_0 @@ -150,5 +160,6 @@ def main(): else: print("No tensors quantized.") + if __name__ == "__main__": main() diff --git a/tools/oquantize/setup.py b/tools/oquantize/setup.py index f8e4691482e..0c5a57fb938 100644 --- a/tools/oquantize/setup.py +++ b/tools/oquantize/setup.py @@ -6,6 +6,7 @@ import shutil + def find_flatc(): # 1. Check FLATC_PATH environment variable flatc_env = os.environ.get('FLATC_PATH') @@ -30,14 +31,18 @@ def find_flatc(): return None + def generate_circle_py(): flatc_path = find_flatc() if not flatc_path: - print("Error: flatc not found. Please set FLATC_PATH environment variable or ensure flatc is in your PATH or build directory.") + print( + "Error: flatc not found. Please set FLATC_PATH environment variable or ensure flatc is in your PATH or build directory." + ) sys.exit(1) script_dir = os.path.dirname(os.path.abspath(__file__)) - schema_path = os.path.abspath(os.path.join(script_dir, '../../runtime/libs/circle-schema/circle_schema.fbs')) + schema_path = os.path.abspath( + os.path.join(script_dir, '../../runtime/libs/circle-schema/circle_schema.fbs')) if not os.path.exists(schema_path): print(f"Error: Schema file not found at {schema_path}") @@ -47,11 +52,7 @@ def generate_circle_py(): print(f"Generating circle.py using {flatc_path} from {schema_path}...") cmd = [ - flatc_path, - '--python', - '--gen-object-api', - '--gen-onefile', - '-o', output_dir, + flatc_path, '--python', '--gen-object-api', '--gen-onefile', '-o', output_dir, schema_path ] @@ -65,9 +66,11 @@ def generate_circle_py(): print(f"Failed to generate circle.py") sys.exit(1) + def compile_ggml_lib(): script_dir = os.path.dirname(os.path.abspath(__file__)) - ggml_src_dir = os.path.abspath(os.path.join(script_dir, '../../runtime/3rdparty/ggml/src')) + ggml_src_dir = os.path.abspath( + os.path.join(script_dir, '../../runtime/3rdparty/ggml/src')) lib_dir = os.path.join(script_dir, 'lib') lib_name = 'libggml_quant.so' lib_path = os.path.join(lib_dir, lib_name) @@ -78,17 +81,11 @@ def compile_ggml_lib(): print(f"Compiling {lib_name} from {ggml_src_dir}...") cmd = [ - 'gcc', - '-shared', - '-fPIC', - '-O3', - '-o', lib_path, + 'gcc', '-shared', '-fPIC', '-O3', '-o', lib_path, os.path.join(ggml_src_dir, 'ggml-quants.c'), os.path.join(ggml_src_dir, 'ggml-aarch64.c'), - os.path.join(ggml_src_dir, 'ggml.c'), - '-I', ggml_src_dir, - '-I', os.path.abspath(os.path.join(ggml_src_dir, '../include')), - '-lm' + os.path.join(ggml_src_dir, 'ggml.c'), '-I', ggml_src_dir, '-I', + os.path.abspath(os.path.join(ggml_src_dir, '../include')), '-lm' ] print("Running command:", " ".join(cmd)) @@ -99,12 +96,14 @@ def compile_ggml_lib(): print(f"Failed to compile {lib_name}: {e}") sys.exit(1) + class CustomBuildPy(build_py): def run(self): generate_circle_py() compile_ggml_lib() super().run() + if __name__ == "__main__": if len(sys.argv) == 1: # Direct execution: python setup.py