Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tools/oquantize/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include oquantize/lib/*.so
46 changes: 46 additions & 0 deletions tools/oquantize/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Circle Model Quantization with GGML

This tool quantizes Circle models using the GGML library.

## Prerequisites
- `gcc` installed
- `flatc` (FlatBuffers compiler) must be available
- Set `FLATC_PATH` if `flatc` is not in your PATH or standard build locations

## Building the Tool

The tool is structured as a Python package `oquantize` located in `tools/oquantize`.
It relies on a C shared library (`libggml_quant.so`, loaded via `ctypes`) that must be compiled first; the same build step also generates `circle.py` from the Circle schema.

```bash
cd tools/oquantize
python3 setup.py
```

This compiles `libggml_quant.so` from the GGML source files and generates `circle.py`.

## Running the Tool

To quantize a Circle model, run the `oquantize` package from the `tools` directory:

```bash
cd tools
# Usage: python -m oquantize <quant_type> <input_circle> <output_circle>
python3 -m oquantize q4_0 prefill.circle prefill.q4.circle
python3 -m oquantize q4_0 decode.circle decode.q4.circle
```

### File Size Comparison

| File | Original Size | Quantized Size | Reduction |
|------|---------------|----------------|-----------|
| prefill.circle | 18M | 2.7M | ~85% |
| decode.circle | 18M | 2.7M | ~85% |

(Note: the large reduction comes from converting FP32 weights to Q4_0, which stores every 32 values (128 bytes as FP32) in a single 18-byte block.)

## Implementation Details
- **Package Structure**: `tools/oquantize/`
- **C Extension**: `libggml_quant.so` compiled from `ggml-quants.c`, `ggml-aarch64.c`, and `ggml.c`
- **Quantization**: Row-wise `GGML_Q4_0` quantization for `GATHER` (input 0) and `FULLY_CONNECTED` (input 1) weights
- **Schema**: `circle.py` generated from `runtime/libs/circle-schema/circle_schema.fbs` using `flatc --python --gen-object-api --gen-onefile`
Empty file added tools/oquantize/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions tools/oquantize/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .main import main

# `python -m oquantize` executes this module as `__main__`; hand control to
# the command-line entry point defined in main.py.
if __name__ == "__main__":
    main()
165 changes: 165 additions & 0 deletions tools/oquantize/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import ctypes
import os
import sys
import numpy as np
import flatbuffers

# Make the FlatBuffers bindings importable as the top-level module `circle`.
# tools/o2o is searched first (the historical location); this package's own
# directory is added as a fallback because setup.py writes the generated
# circle.py next to this file.
_package_dir = os.path.dirname(os.path.abspath(__file__))
for _candidate in (os.path.join(_package_dir, '../o2o'), _package_dir):
    _candidate = os.path.abspath(_candidate)
    if _candidate not in sys.path:
        sys.path.append(_candidate)
try:
    import circle
except ImportError:
    print(
        "Error: Could not import 'circle'. Make sure tools/o2o is in PYTHONPATH, "
        "or generate circle.py by running 'python3 setup.py' in tools/oquantize."
    )
    sys.exit(1)


def load_ggml_library():
    """Load the bundled GGML quantization library and declare its C signature.

    The shared object is looked up at ``lib/libggml_quant.so`` relative to
    this file. If it is missing, an error is printed and the process exits
    with status 1.

    Returns:
        The ``ctypes.CDLL`` handle, with ``quantize_row_q4_0`` configured.
    """
    package_dir = os.path.dirname(__file__)
    lib_path = os.path.join(package_dir, 'lib', 'libggml_quant.so')

    if os.path.exists(lib_path):
        lib = ctypes.CDLL(lib_path)

        # C prototype:
        #   void quantize_row_q4_0(const float * x, void * y, int64_t k);
        quantize_row = lib.quantize_row_q4_0
        quantize_row.restype = None
        quantize_row.argtypes = [
            ctypes.POINTER(ctypes.c_float),
            ctypes.c_void_p,
            ctypes.c_int64,
        ]
        return lib

    print(f"Error: {lib_path} not found. Please build the package first.")
    sys.exit(1)


def quantize_tensor(lib, tensor_data):
    """Quantize a flat run of float32 values to GGML Q4_0 blocks.

    Args:
        lib: Library handle exposing ``quantize_row_q4_0(x, y, k)``, as
            returned by ``load_ggml_library``.
        tensor_data: Array-like of values to quantize. It is converted to a
            C-contiguous float32 array before its address is handed to C, so
            other dtypes or strided views are accepted safely.

    Returns:
        A ``bytearray`` holding ``size // 32`` blocks of 18 bytes each, or
        ``None`` (after printing a warning) when the element count is not a
        multiple of 32.
    """
    # QK4_0: number of values packed into one Q4_0 block.
    qk4_0 = 32
    # block_q4_0 size = sizeof(ggml_half) + QK4_0 / 2 = 2 + 16 = 18 bytes
    block_bytes = 18

    # The C routine reads raw memory through a float pointer, so the input
    # must really be contiguous float32; otherwise it would read garbage.
    values = np.ascontiguousarray(tensor_data, dtype=np.float32)
    k = values.size

    if k % qk4_0 != 0:
        print(f"Warning: Tensor size {k} is not a multiple of 32. Skipping quantization.")
        return None

    output_size = (k // qk4_0) * block_bytes
    output_buffer = (ctypes.c_ubyte * output_size)()

    # `values` stays referenced for the duration of the call, keeping the
    # memory behind `input_ptr` alive.
    input_ptr = values.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

    # quantize_row_q4_0 processes all k elements in a single call.
    lib.quantize_row_q4_0(input_ptr, output_buffer, ctypes.c_int64(k))

    return bytearray(output_buffer)


def _find_weight_tensor_index(model_t, op):
    """Return the tensor index of the weight input of *op*, or -1 if none."""
    if op.opcodeIndex >= len(model_t.operatorCodes):
        return -1

    builtin_code = model_t.operatorCodes[op.opcodeIndex].builtinCode
    if builtin_code == circle.BuiltinOperator.GATHER:
        # GATHER: input 0 is params (weights)
        weight_slot = 0
    elif builtin_code == circle.BuiltinOperator.FULLY_CONNECTED:
        # FULLY_CONNECTED: input 1 is weights
        weight_slot = 1
    else:
        return -1

    # `inputs` may be a numpy array, so test against None explicitly rather
    # than relying on truthiness.
    inputs = op.inputs if op.inputs is not None else []
    if len(inputs) > weight_slot:
        return inputs[weight_slot]
    return -1


def _quantize_weight(lib, model_t, tensor, tensor_idx, quantized_buffers):
    """Quantize the buffer behind a FLOAT32 *tensor* in place.

    *quantized_buffers* is the set of buffer indices already converted during
    this run; it is updated on success. Returns True when the tensor ends up
    typed as GGML_Q4_0, False when it was left untouched.
    """
    buffer_idx = tensor.buffer

    if buffer_idx in quantized_buffers:
        # Another tensor sharing this buffer was already converted. The bytes
        # are Q4_0 now, so only the type needs updating; re-reading them as
        # float32 would corrupt the data.
        tensor.type = circle.TensorType.GGML_Q4_0
        return True

    if buffer_idx >= len(model_t.buffers):
        return False

    buffer_obj = model_t.buffers[buffer_idx]
    if buffer_obj.data is None:
        return False

    # buffer_obj.data is either a list of ints or a numpy uint8 array,
    # depending on how the generated bindings unpacked it.
    data_bytes = bytes(buffer_obj.data)
    if not data_bytes:
        # No constant data: nothing to quantize, and the tensor must keep
        # its FLOAT32 type.
        return False
    if len(data_bytes) % 4 != 0:
        print(
            f"Warning: Tensor {tensor_idx} buffer size {len(data_bytes)} is not a multiple of 4. Skipping quantization."
        )
        return False

    # Quantization is row-wise, so each row (last dimension) has to be a
    # whole number of 32-element blocks, not just the tensor as a whole.
    shape = tensor.shape
    if shape is not None and len(shape) > 0:
        row_len = int(shape[-1])
        if row_len % 32 != 0:
            print(
                f"Warning: Tensor {tensor_idx} row length {row_len} is not a multiple of 32. Skipping quantization."
            )
            return False

    tensor_data = np.frombuffer(data_bytes, dtype=np.float32)
    print(f"Quantizing tensor {tensor_idx} (size={tensor_data.size})...")

    quantized_data = quantize_tensor(lib, tensor_data)
    if quantized_data is None:
        return False

    # The generated Pack() code serializes numpy arrays directly, so store
    # the result as a uint8 array rather than a Python list of ints.
    buffer_obj.data = np.frombuffer(quantized_data, dtype=np.uint8)
    tensor.type = circle.TensorType.GGML_Q4_0
    quantized_buffers.add(buffer_idx)
    return True


def main():
    """Command-line entry point: quantize a Circle model's weights to Q4_0.

    Expects ``sys.argv`` to be ``[prog, quant_type, input_circle,
    output_circle]``. The FLOAT32 weights of GATHER (input 0) and
    FULLY_CONNECTED (input 1) operators are quantized, and the model is
    written to ``output_circle`` only if at least one tensor was converted.

    Exits with status 1 on a usage error, an unsupported quantization type,
    or a missing input file.
    """
    if len(sys.argv) != 4:
        print("Usage: python -m oquantize <quant_type> <input_circle> <output_circle>")
        print("Supported quant_type: q4_0")
        sys.exit(1)

    quant_type = sys.argv[1]
    input_path = sys.argv[2]
    output_path = sys.argv[3]

    if quant_type != "q4_0":
        print(
            f"Error: Unsupported quantization type '{quant_type}'. Only 'q4_0' is supported."
        )
        sys.exit(1)

    if not os.path.exists(input_path):
        print(f"Error: Input file {input_path} does not exist.")
        sys.exit(1)

    lib = load_ggml_library()

    print(f"Loading {input_path}...")
    with open(input_path, 'rb') as f:
        buf = f.read()

    model = circle.Model.GetRootAs(buf, 0)
    model_t = circle.ModelT.InitFromObj(model)

    quantized_count = 0
    # Buffer indices already rewritten, so shared buffers are converted once.
    quantized_buffers = set()

    for subgraph in model_t.subgraphs or ():
        for op in subgraph.operators or ():
            target_tensor_idx = _find_weight_tensor_index(model_t, op)
            if target_tensor_idx == -1:
                continue

            tensor = subgraph.tensors[target_tensor_idx]
            if tensor.type != circle.TensorType.FLOAT32:
                continue

            if _quantize_weight(lib, model_t, tensor, target_tensor_idx,
                                quantized_buffers):
                quantized_count += 1

    if quantized_count > 0:
        print(f"Quantized {quantized_count} tensors.")
        print(f"Saving to {output_path}...")

        builder = flatbuffers.Builder(1024)
        model_offset = model_t.Pack(builder)
        builder.Finish(model_offset, file_identifier=b'CIR0')

        with open(output_path, 'wb') as f:
            f.write(builder.Output())
        print("Done.")
    else:
        print("No tensors quantized.")


# When this file is executed as a script, run the command-line entry point.
if __name__ == "__main__":
    main()
128 changes: 128 additions & 0 deletions tools/oquantize/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import os
import subprocess
import sys
from setuptools import setup, find_packages
from setuptools.command.build_py import build_py

import shutil


def find_flatc():
    """Locate the ``flatc`` FlatBuffers compiler.

    Lookup order:
      1. the ``FLATC_PATH`` environment variable, if it names an executable file;
      2. ``flatc`` on the system ``PATH``;
      3. the release and debug overlay directories of the repository build tree.

    Returns:
        The path to ``flatc``, or ``None`` when no candidate qualifies.
    """

    def _is_executable_file(candidate):
        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

    # 1. Explicit override from the environment
    override = os.environ.get('FLATC_PATH')
    if override and _is_executable_file(override):
        return override

    # 2. Whatever the system PATH offers
    on_path = shutil.which('flatc')
    if on_path:
        return on_path

    # 3. Well-known build output locations relative to this script
    here = os.path.dirname(os.path.abspath(__file__))
    for build_type in ('release', 'debug'):
        candidate = os.path.join(here, f'../../build/{build_type}/overlay/bin/flatc')
        if _is_executable_file(candidate):
            return candidate

    return None


def generate_circle_py():
    """Generate ``circle.py`` next to this script from the Circle schema.

    Runs ``flatc --python --gen-object-api --gen-onefile`` on
    ``runtime/libs/circle-schema/circle_schema.fbs`` and renames the resulting
    ``circle_schema_generated.py`` to ``circle.py``.

    Exits with status 1 when ``flatc`` or the schema cannot be found, or when
    generation fails.
    """
    flatc_path = find_flatc()
    if not flatc_path:
        print(
            "Error: flatc not found. Please set FLATC_PATH environment variable or ensure flatc is in your PATH or build directory."
        )
        sys.exit(1)

    script_dir = os.path.dirname(os.path.abspath(__file__))
    schema_path = os.path.abspath(
        os.path.join(script_dir, '../../runtime/libs/circle-schema/circle_schema.fbs'))

    if not os.path.exists(schema_path):
        print(f"Error: Schema file not found at {schema_path}")
        sys.exit(1)

    output_dir = script_dir

    print(f"Generating circle.py using {flatc_path} from {schema_path}...")
    cmd = [
        flatc_path, '--python', '--gen-object-api', '--gen-onefile', '-o', output_dir,
        schema_path
    ]
    generated_file = os.path.join(output_dir, 'circle_schema_generated.py')
    target_file = os.path.join(output_dir, 'circle.py')

    try:
        subprocess.run(cmd, check=True)
        # os.replace overwrites an existing circle.py on every platform,
        # whereas os.rename fails on Windows if the target already exists.
        os.replace(generated_file, target_file)
    except (subprocess.CalledProcessError, OSError) as e:
        # Catch only the expected failures so Ctrl-C and sys.exit() are not
        # swallowed, and report the cause instead of hiding it.
        print(f"Failed to generate circle.py: {e}")
        sys.exit(1)

    print("Successfully generated circle.py")


def compile_ggml_lib():
    """Compile ``lib/libggml_quant.so`` from the bundled GGML sources with gcc.

    The sources are taken from ``runtime/3rdparty/ggml/src`` relative to this
    script, and the output directory ``lib`` is created if needed.

    Exits with status 1 when the compiler cannot be started or returns a
    non-zero status.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    ggml_src_dir = os.path.abspath(
        os.path.join(script_dir, '../../runtime/3rdparty/ggml/src'))
    lib_dir = os.path.join(script_dir, 'lib')
    lib_name = 'libggml_quant.so'
    lib_path = os.path.join(lib_dir, lib_name)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(lib_dir, exist_ok=True)

    print(f"Compiling {lib_name} from {ggml_src_dir}...")

    cmd = [
        'gcc', '-shared', '-fPIC', '-O3', '-o', lib_path,
        os.path.join(ggml_src_dir, 'ggml-quants.c'),
        os.path.join(ggml_src_dir, 'ggml-aarch64.c'),
        os.path.join(ggml_src_dir, 'ggml.c'), '-I', ggml_src_dir, '-I',
        os.path.abspath(os.path.join(ggml_src_dir, '../include')), '-lm'
    ]

    print("Running command:", " ".join(cmd))
    try:
        subprocess.check_call(cmd)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where gcc itself is missing or cannot be
        # executed, which would otherwise surface as a raw traceback.
        print(f"Failed to compile {lib_name}: {e}")
        sys.exit(1)

    print(f"Successfully compiled {lib_path}")


class CustomBuildPy(build_py):
    """``build_py`` command that prepares generated artifacts first."""

    def run(self):
        """Generate circle.py, compile the GGML library, then run build_py."""
        for prepare_step in (generate_circle_py, compile_ggml_lib):
            prepare_step()
        super().run()


if __name__ == "__main__":
    if len(sys.argv) == 1:
        # Direct execution with no arguments: python setup.py
        # Generate circle.py and compile the library in-place.
        generate_circle_py()
        compile_ggml_lib()
    else:
        # Standard setuptools execution
        setup(
            name='oquantize',
            version='0.1.0',
            packages=['oquantize'],
            # The package sources live in the same directory as this script.
            package_dir={'oquantize': '.'},
            include_package_data=True,
            # Ship the compiled library explicitly. Patterns are relative to
            # the package directory, where compile_ggml_lib() writes
            # lib/libggml_quant.so.
            package_data={'oquantize': ['lib/*.so']},
            cmdclass={
                'build_py': CustomBuildPy,
            },
            install_requires=[
                'numpy',
                'flatbuffers',
            ],
        )