diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
deleted file mode 100644
index c4551dc..0000000
--- a/.github/workflows/build.yaml
+++ /dev/null
@@ -1,119 +0,0 @@
----
-name: Build quantize binary
-
-permissions:
-  contents: read
-
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - "v*"
-  pull_request:
-    branches:
-      - main
-  workflow_dispatch:
-
-jobs:
-  macos-build:
-    name: "Build quantize on macOS ARM64 (M1)"
-    runs-on: "macos-14"
-    steps:
-      - uses: "actions/checkout@v4"
-        with:
-          submodules: true
-
-      - name: system info
-        run: sysctl -a
-
-      - name: make build/quantize from llama.cpp sources
-        env:
-          CMAKE_ARGS: "-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON"
-        run: make quantize
-
-      - name: file info
-        run: file build/quantize-arm64-darwin
-
-      - name: test quantize
-        run: |
-          build/quantize-arm64-darwin \
-            llama.cpp/models/ggml-vocab-llama.gguf \
-            /tmp/ggml-vocab-Q4_K_M.gguf \
-            Q4_K_M
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: "quantize-arm64-darwin"
-          path: build/quantize-arm64-darwin
-
-  linux-build:
-    name: "Build quantize on Linux for ${{ matrix.arch }}"
-    runs-on: "ubuntu-latest"
-    strategy:
-      fail-fast: true
-      matrix:
-        include:
-          - arch: "amd64"
-            suffix: "x86_64-linux"
-            image: quay.io/sclorg/python-312-c8s:c8s
-          - arch: "arm64"
-            suffix: "aarch64-linux"
-            image: quay.io/sclorg/python-312-c8s:c8s
-    steps:
-      - uses: "actions/checkout@v4"
-        with:
-          submodules: true
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Pull ${{ matrix.image }} for linux/${{ matrix.arch }}
-        run: |
-          docker pull --platform linux/${{ matrix.arch }} ${{ matrix.image }}
-
-      - name: make build/quantize from llama.cpp sources
-        run: |
-          set -e
-          docker run --platform linux/${{ matrix.arch }} ${{ matrix.image }} uname -a
-          docker run --platform linux/${{ matrix.arch }} \
-            -v .:/opt/app-root/src \
-            -e CMAKE_ARGS="-DLLAMA_FATAL_WARNINGS=ON" \
-            ${{ matrix.image }} \
-            make quantize
-
-      - name: file info
-        run: file build/quantize-${{ matrix.suffix }}
-
-      - name: file symbols
-        run: nm -a build/quantize-${{ matrix.suffix }} | grep -o "GLIBC.*" | sort -u
-
-      - name: test quantize
-        run: |
-          docker run --platform linux/${{ matrix.arch }} \
-            -v .:/opt/app-root/src \
-            ${{ matrix.image }} \
-            build/quantize-${{ matrix.suffix }} \
-            llama.cpp/models/ggml-vocab-llama.gguf \
-            /tmp/ggml-vocab-Q4_K_M.gguf \
-            Q4_K_M
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: "quantize-${{ matrix.suffix }}"
-          path: build/quantize-${{ matrix.suffix }}
-
-  merge-artifacts:
-    name: Merge artifacts
-    runs-on: ubuntu-latest
-    needs:
-      - macos-build
-      - linux-build
-    steps:
-      - name: Merge artifacts
-        uses: actions/upload-artifact/merge@v4
-        with:
-          name: quantize
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
index d830efa..4fa14e4 100644
--- a/.github/workflows/pypi.yaml
+++ b/.github/workflows/pypi.yaml
@@ -14,12 +14,6 @@ on:
     types:
       - published
 
-permissions:
-  # allow gh release upload
-  contents: write
-  # see https://docs.pypi.org/trusted-publishers/
-  id-token: write
-
 jobs:
   build-package:
     name: Build and check packages
@@ -29,27 +23,73 @@ jobs:
         with:
           # for setuptools-scm
           fetch-depth: 0
+          submodules: true
 
       - uses: hynek/build-and-inspect-python-package@v2
+        with:
+          skip-wheel: 'false'
+
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        # macos-13 is an intel runner, macos-14 is apple silicon
+        os: [ubuntu-latest, ubuntu-24.04-arm, macos-14]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # for setuptools-scm
+          fetch-depth: 0
+          submodules: true
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.22.0
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
+          path: ./wheelhouse/*.whl
+
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # for setuptools-scm
+          fetch-depth: 0
+          submodules: true
+
+      - name: Build sdist
+        run: pipx run build --sdist
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: cibw-sdist
+          path: dist/*.tar.gz
 
   publish-test-pypi:
     name: Publish packages to test.pypi.org
     # environment: publish-test-pypi
-    # TODO: move to instructlab
+    permissions:
+      id-token: write
     if: |
-      github.repository_owner == 'tiran' && (
+      github.repository_owner == 'instructlab' && (
        github.event.action == 'published'
        || (github.event_name == 'push' && github.ref == 'refs/heads/main')
       )
     runs-on: ubuntu-latest
-    needs: build-package
+    needs: [build_wheels, build_sdist]
     steps:
       - name: Fetch build artifacts
         uses: actions/download-artifact@v4
         with:
-          name: Packages
+          pattern: cibw-*
           path: dist
+          merge-multiple: true
 
       - name: Upload to Test PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
@@ -59,20 +99,23 @@ jobs:
 
   publish-pypi:
     name: Publish release to pypi.org
     # environment: publish-pypi
-    # TODO: move to instructlab
+    permissions:
+      contents: write
+      id-token: write
     if: |
-      github.repository_owner == 'tiran' && github.event.action == 'published'
+      github.repository_owner == 'instructlab' && github.event.action == 'published'
     runs-on: ubuntu-latest
-    needs: build-package
+    needs: [build_wheels, build_sdist]
     steps:
       - name: Fetch build artifacts
         uses: actions/download-artifact@v4
         with:
-          name: Packages
+          pattern: cibw-*
           path: dist
+          merge-multiple: true
 
-      - uses: sigstore/gh-action-sigstore-python@v2.1.1
+      - uses: sigstore/gh-action-sigstore-python@v3.0.0
         with:
           inputs: >-
             ./dist/*.tar.gz
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 04fa1ca..31726ee 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -26,11 +26,13 @@ jobs:
           - "3.10"
           - "3.11"
           - "3.12"
-          - "3.13-dev"
+          - "3.13"
     steps:
       - uses: "actions/checkout@v4"
        with:
-          submodules: true
+          submodules: true
+          # for setuptools-scm
+          fetch-depth: 0
 
       - uses: "actions/setup-python@v5"
         with:
@@ -38,7 +40,7 @@
          allow-prereleases: true
 
       - name: "Update pip"
-        run: python -m pip install --upgrade pip setuptools wheel
+        run: python -m pip install --upgrade pip
 
       - name: "Install tox dependencies"
         run: python -m pip install --upgrade tox tox-gh-actions
@@ -55,7 +57,7 @@
          submodules: true
 
       - name: "Update pip"
-        run: python -m pip install --upgrade pip setuptools wheel
+        run: python -m pip install --upgrade pip
 
       - name: "Install tox dependencies"
         run: python -m pip install --upgrade tox
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..79255cd
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,23 @@
+include tox.ini tests.py .pylintrc
+recursive-include llama.cpp *
+exclude llama.cpp/.git
+
+global-exclude gguf.inp gguf.out
+exclude llama.cpp/models/ggml-vocab-aquila.gguf
+exclude llama.cpp/models/ggml-vocab-baichuan.gguf
+exclude llama.cpp/models/ggml-vocab-bert-bge.gguf
+exclude llama.cpp/models/ggml-vocab-command-r.gguf
+exclude llama.cpp/models/ggml-vocab-deepseek-coder.gguf
+exclude llama.cpp/models/ggml-vocab-deepseek-llm.gguf
+exclude llama.cpp/models/ggml-vocab-falcon.gguf
+exclude llama.cpp/models/ggml-vocab-gpt2.gguf
+exclude llama.cpp/models/ggml-vocab-gpt-neox.gguf
+# used in tests.py
+# exclude llama.cpp/models/ggml-vocab-llama-bpe.gguf
+exclude llama.cpp/models/ggml-vocab-llama-spm.gguf
+exclude llama.cpp/models/ggml-vocab-mpt.gguf
+exclude llama.cpp/models/ggml-vocab-phi-3.gguf
+exclude llama.cpp/models/ggml-vocab-qwen2.gguf
+exclude llama.cpp/models/ggml-vocab-refact.gguf
+exclude llama.cpp/models/ggml-vocab-stablelm-3b-4e1t.gguf
+exclude llama.cpp/models/ggml-vocab-starcoder.gguf
diff --git a/Makefile b/Makefile
deleted file mode 100644
index d93152e..0000000
--- a/Makefile
+++ /dev/null
@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-CMAKE_ARGS ?=
-
-UNAME_MACHINE = $(shell uname -m | tr A-Z a-z)
-UNAME_OS = $(shell uname -s | tr A-Z a-z)
-QUANTIZE = build/quantize-$(UNAME_MACHINE)-$(UNAME_OS)
-LLAMA_BUILDDIR = build/llama.cpp-$(UNAME_MACHINE)-$(UNAME_OS)
-LLAMA_DIR = llama.cpp
-
-
-.PHONY: all
-all: test $(QUANTIZE)
-
-.PHONY: test
-test:
-	tox p
-
-.PHONY: fix
-fix:
-	tox -e format --
-	tox -e ruff -- --fix
-
-.PHONY: clean
-clean:
-	rm -rf .tox .ruff_cache dist build
-
-$(LLAMA_BUILDDIR)/Makefile: $(LLAMA_DIR)/CMakeLists.txt
-	@mkdir -p $(dir $@)
-	CMAKE_ARGS="$(CMAKE_ARGS)" cmake -S $(dir $<) -B $(dir $@)
-
-$(LLAMA_BUILDDIR)/bin/quantize: $(LLAMA_BUILDDIR)/Makefile
-	cmake --build $(dir $<) --parallel 2 --config Release --target quantize
-
-.PHONY: quantize
-quantize: $(QUANTIZE)
-
-$(QUANTIZE): $(LLAMA_BUILDDIR)/bin/quantize
-	cp -a $< $@
diff --git a/llama.cpp b/llama.cpp
index 784e11d..b95c8af 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 784e11dea1f5ce9638851b2b0dddb107e2a609c8
+Subproject commit b95c8af37ccf169b0a3216b7ed691af0534e5091
diff --git a/pyproject.toml b/pyproject.toml
index cea6f47..ca46d09 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 [build-system]
-requires = ["setuptools>=64", "setuptools_scm>=8"]
+requires = ["setuptools>=64", "setuptools_scm>=8", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -34,10 +34,12 @@ dynamic = ["version"]
 [project.urls]
 # TODO: move the project to instructlab
 # homepage = "https://instructlab.io"
-source = "https://github.com/tiran/instructlab-quantize"
-issues = "https://github.com/tiran/instructlab-quantize/issues"
+source = "https://github.com/instructlab/instructlab-quantize"
+issues = "https://github.com/instructlab/instructlab-quantize/issues"
 
 [tool.setuptools_scm]
+# do not include +gREV local version, required for Test PyPI upload
+local_scheme = "no-local-version"
 
 [tool.setuptools]
 package-dir = {"" = "src"}
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..d0652e0
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,116 @@
+import os
+import platform
+import subprocess
+import sys
+
+from setuptools import setup
+from setuptools.command.build_py import build_py
+from setuptools.dist import Distribution
+from wheel.bdist_wheel import bdist_wheel as bdist_wheel
+
+CMAKE_ARGS = [
+    "-DCMAKE_BUILD_TYPE=Release",
+    "-DBUILD_SHARED_LIBS=OFF",
+    # build with base ISA
+    "-DGGML_NATIVE=OFF",
+    "-DLLAMA_NATIVE=OFF",
+    "-DLLAMA_BUILD_TESTS=OFF",
+    "-DLLAMA_BUILD_SERVER=OFF",
+]
+CMAKE_ARGS_X86_64 = [
+    # force x86_64-v2 ISA
+    "-DGGML_AVX=OFF",
+    "-DGGML_AVX2=OFF",
+    "-DGGML_FMA=OFF",
+    "-DGGML_F16C=OFF",
+    "-DLLAMA_AVX=OFF",
+    "-DLLAMA_AVX2=OFF",
+    "-DLLAMA_FMA=OFF",
+    "-DLLAMA_F16C=OFF",
+]
+CMAKE_ARGS_DARWIN_AARCH64 = [
+    # build and embed METAL on Apple M
+    "-DGGML_METAL=ON",
+    "-DGGML_METAL_EMBED_LIBRARY=ON",
+    "-DLLAMA_METAL=ON",
+    "-DLLAMA_METAL_EMBED_LIBRARY=ON",
+]
+QUANTIZE_BINARY = "llama-quantize"
+
+
+class Py3NoneBdistWheel(bdist_wheel):
+    """Tag wheel as py3-none-{tag}"""
+
+    def finalize_options(self) -> None:
+        super().finalize_options()
+        self.root_is_pure = False
+
+    def get_tag(self) -> tuple[str, str, str]:
+        _py, _abi, plat_name = super().get_tag()
+        return "py3", "none", plat_name
+
+
+class QuantizeBuildPy(build_py):
+    """Hack to build and copy quantize binary with Python files"""
+
+    def build_quantize(self) -> None:
+        # Switch to scikit-build-core? I have not found an example how to
+        # ship a program with scikit-build-core.
+        arch = platform.uname().machine
+        build_cmd = self.get_finalized_command("build")
+        package_name = self.distribution.packages[0]
+        build_temp = build_cmd.build_temp
+        cmake_args = [
+            "cmake",
+            "-S",
+            "llama.cpp",
+            "-B",
+            build_temp,
+        ]
+        cmake_args.extend(CMAKE_ARGS)
+        if sys.platform == "darwin" and arch == "aarch64":
+            cmake_args.extend(CMAKE_ARGS_DARWIN_AARCH64)
+        elif arch == "x86_64":
+            cmake_args.extend(CMAKE_ARGS_X86_64)
+        print(f"Run {' '.join(cmake_args)}")
+        subprocess.check_call(cmake_args)
+
+        build_args = [
+            "cmake",
+            "--build",
+            build_temp,
+            "--config",
+            "Release",
+            "--target",
+            QUANTIZE_BINARY,
+        ]
+        print(f"Run {' '.join(build_args)}")
+        subprocess.check_call(build_args)
+
+        infile = os.path.join(build_temp, "bin", QUANTIZE_BINARY)
+        outname = f"quantize-{arch}-{sys.platform}"
+        outfile = os.path.join(self.build_lib, package_name, outname)
+        directory = os.path.dirname(outfile)
+        os.makedirs(directory, exist_ok=True)
+        self.copy_file(infile, outfile, preserve_mode=True)
+        self.package_data[package_name] = [outname]
+
+    def run(self) -> None:
+        self.build_quantize()
+        return super().run()
+
+
+class BinaryDistribution(Distribution):
+    """Mark package has platlib package"""
+
+    def has_ext_modules(foo) -> bool:
+        return True
+
+
+setup(
+    distclass=BinaryDistribution,
+    cmdclass={
+        "bdist_wheel": Py3NoneBdistWheel,
+        "build_py": QuantizeBuildPy,
+    },
+)
diff --git a/src/instructlab_quantize/quantize-aarch64-linux b/src/instructlab_quantize/quantize-aarch64-linux
deleted file mode 100755
index 107319d..0000000
Binary files a/src/instructlab_quantize/quantize-aarch64-linux and /dev/null differ
diff --git a/src/instructlab_quantize/quantize-arm64-darwin b/src/instructlab_quantize/quantize-arm64-darwin
deleted file mode 100755
index 83a9a9e..0000000
Binary files a/src/instructlab_quantize/quantize-arm64-darwin and /dev/null differ
diff --git a/src/instructlab_quantize/quantize-x86_64-linux b/src/instructlab_quantize/quantize-x86_64-linux
deleted file mode 100755
index 80461a4..0000000
Binary files a/src/instructlab_quantize/quantize-x86_64-linux and /dev/null differ
diff --git a/tests.py b/tests.py
index 01d120b..3afb0c7 100644
--- a/tests.py
+++ b/tests.py
@@ -7,9 +7,10 @@ import sys
 
 from unittest import mock
 
-import instructlab_quantize
 import pytest
 
+import instructlab_quantize
+
 PKG_DIR = pathlib.Path(instructlab_quantize.__file__).absolute().parent
 
 
@@ -42,7 +43,7 @@ def test_run_quantize(tmp_path: pathlib.Path):
     quant_type = "Q4_K_M"
     outfile = tmp_path / "ggml-vocab-{quant_type}.gguf"
     instructlab_quantize.run_quantize(
-        "llama.cpp/models/ggml-vocab-llama.gguf",
+        "llama.cpp/models/ggml-vocab-llama-bpe.gguf",
         os.fspath(outfile),
         quant_type,
     )
diff --git a/tox.ini b/tox.ini
index c8483a3..2612559 100644
--- a/tox.ini
+++ b/tox.ini
@@ -41,6 +41,16 @@ deps =
 commands =
     ruff format {posargs:--check}
 
+[testenv:fix]
+description = fix code with Ruff
+skip_install = True
+skipsdist = true
+deps =
+    ruff
+commands =
+    ruff format
+    ruff check --fix
+
 [gh-actions]
 python =
     3.9: py39
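
For reference, the wheels produced by the new setup.py expose the bundled llama-quantize binary through instructlab_quantize.run_quantize(). A minimal usage sketch, mirroring the call in tests.py; the output path below is illustrative and not taken from the patch:

    import os
    import pathlib

    import instructlab_quantize

    # Quantize the BPE vocab model from the llama.cpp submodule to Q4_K_M,
    # using the positional arguments exercised by test_run_quantize in tests.py:
    # (input GGUF path, output GGUF path, quantization type).
    outfile = pathlib.Path("/tmp/ggml-vocab-Q4_K_M.gguf")
    instructlab_quantize.run_quantize(
        "llama.cpp/models/ggml-vocab-llama-bpe.gguf",
        os.fspath(outfile),
        "Q4_K_M",
    )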