z-lab
diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
Lines changed: 2 additions & 2 deletions
diff --git a/README.md b/README.md
Lines changed: 28 additions & 8 deletions
diff --git a/docker/Dockerfile b/docker/Dockerfile
Lines changed: 2 additions & 0 deletions
@@ -30,15 +30,15 @@ jobs:
             target: chat
             cuda_version: "13.0.2"
             torch_cuda_arch_list: "8.0 8.6 8.7 8.9 9.0 10.0 12.0 12.1"
-            vllm_version: "0.15.1"
+            vllm_version: "0.17.0"
             cuda_toolkit: "cu130"
             platforms: linux/amd64,linux/arm64
 
           - tag: serve
             target: serve
             cuda_version: "13.0.2"
             torch_cuda_arch_list: "8.0 8.6 8.7 8.9 9.0 10.0 12.0 12.1"
-            vllm_version: "0.15.1"
+            vllm_version: "0.17.0"
             cuda_toolkit: "cu130"
             platforms: linux/amd64,linux/arm64
 
 
@@ -19,35 +19,55 @@ State-of-the-art INT4 quantization for LLMs. ParoQuant uses learned pairwise rot
 
 ## Quick Start
 
-### Interactive Chat
+### Installation
 
 ```bash
 # NVIDIA GPU
 pip install "paroquant[vllm]"
-python -m paroquant.cli.chat --model z-lab/Qwen3-8B-PARO
 
 # Apple Silicon
 pip install "paroquant[mlx]"
-python -m paroquant.cli.chat --model z-lab/Qwen3-8B-PARO
+```
+
+Pick a model from our [Hugging Face collection](https://huggingface.co/collections/z-lab/paroquant):
+
+```bash
+export MODEL=z-lab/Qwen3.5-4B-PARO
+```
+
+### Interactive Chat
+
+```bash
+python -m paroquant.cli.chat --model $MODEL
 ```
 
 ### OpenAI-Compatible API Server
 
 ```bash
-pip install "paroquant[vllm]"
-python -m paroquant.cli.serve --model z-lab/Qwen3-8B-PARO
+python -m paroquant.cli.serve --model $MODEL --port 8000
+```
+
+### Agent with Tool Calling
+
+Start the API server first, then install the agent dependencies and run:
+
+```bash
+pip install "paroquant[agent]"
+python -m paroquant.cli.agent --model $MODEL
 ```
 
-### Docker
+Tool use (web fetch, filesystem, time) requires [uv](https://docs.astral.sh/uv/) and [Node.js](https://nodejs.org/en/download).
+
+### Docker (NVIDIA GPU)
 
 ```bash
 # Interactive chat
 docker run --pull=always --rm -it --gpus all --ipc=host \
-  ghcr.io/z-lab/paroquant:chat --model z-lab/Qwen3-8B-PARO
+  ghcr.io/z-lab/paroquant:chat --model $MODEL
 
 # API server (port 8000)
 docker run --pull=always --rm -it --gpus all --ipc=host -p 8000:8000 \
-  ghcr.io/z-lab/paroquant:serve --model z-lab/Qwen3-8B-PARO
+  ghcr.io/z-lab/paroquant:serve --model $MODEL
 ```
 
 ## Models
 
@@ -55,6 +55,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         pip install -e ".[vllm]"; \
     fi
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+ENV TRITON_PTXAS_BLACKWELL_PATH=/usr/local/cuda/bin/ptxas
 ENTRYPOINT ["python", "-m", "paroquant.cli.chat"]
 
 # ---- serve: OpenAI-compatible vLLM API server ----
@@ -72,6 +73,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         pip install -e ".[vllm]"; \
     fi
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+ENV TRITON_PTXAS_BLACKWELL_PATH=/usr/local/cuda/bin/ptxas
 ENTRYPOINT ["python", "-m", "paroquant.cli.serve"]
 
 # ---- optim: optimization & evaluation ----