
Commit a111911

Merge pull request #4 from neuralmagic/package-refactor
Turn this repo into an installable python project! Now there is a python interface for quantizing, calibrating, and saving your models as FP8.

Example:

```python
from transformers import AutoTokenizer
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = ["auto_fp8 is an easy-to-use model quantization library"]
examples = tokenizer(examples, return_tensors="pt").to("cuda")

quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="dynamic")

model = AutoFP8ForCausalLM.from_pretrained(
    pretrained_model_dir, quantize_config=quantize_config
)
model.quantize(examples)
model.save_quantized(quantized_model_dir)
```

Then, load it into vLLM for inference!

```
pip install vllm==0.4.2
```

```python
from vllm import LLM

model = LLM("Meta-Llama-3-8B-Instruct-FP8")
# INFO 05-10 18:02:40 model_runner.py:175] Loading model weights took 8.4595 GB

print(model.generate("Once upon a time"))
# [RequestOutput(request_id=0, prompt='Once upon a time', prompt_token_ids=[128000, 12805, 5304, 264, 892], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' there was a man who fell in love with a woman. The man was so', token_ids=[1070, 574, 264, 893, 889, 11299, 304, 3021, 449, 264, 5333, 13, 578, 893, 574, 779], cumulative_logprob=-21.314169232733548, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1715378569.478381, last_token_time=1715378569.478381, first_scheduled_time=1715378569.480648, first_token_time=1715378569.7070432, time_in_queue=0.002267122268676758, finished_time=1715378570.104807), lora_request=None)]
```
2 parents 608baca + e7ff7bf commit a111911

File tree

12 files changed: +735, -32 lines


.github/workflows/test.yaml

Lines changed: 32 additions & 0 deletions
```yaml
name: test

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .
          pip install pytest
      - name: Test
        run: |
          pytest tests -s -v
```

.gitignore

Lines changed: 189 additions & 0 deletions
```gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/*.rst
!**/*.template.rst

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# VSCode
.vscode/

# DS Store
.DS_Store

# Results
*.csv

# Python pickle files
*.pkl

# Sphinx documentation
_build/

# vim swap files
*.swo
*.swp

# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h

# Benchmark dataset
*.json
```

README.md

Lines changed: 45 additions & 20 deletions
````diff
@@ -1,32 +1,57 @@
 # AutoFP8

-Open-source FP8 quantization project for producing compressed checkpoints for running in vLLM - see https://github.com/vllm-project/vllm/pull/4332 for implementation.
+Open-source FP8 quantization library for producing compressed checkpoints for running in vLLM - see https://github.com/vllm-project/vllm/pull/4332 for details on the implementation for inference.

-## How to quantize a model
+## Installation

-Install this repo's requirements:
+Clone this repo and install it from source:
 ```bash
-pip install -r requirements.txt
+git clone https://github.com/neuralmagic/AutoFP8.git
+pip install -e AutoFP8
 ```

-Command to produce a `Meta-Llama-3-8B-Instruct-FP8` quantized LLM:
-```bash
-python quantize.py --model-id meta-llama/Meta-Llama-3-8B-Instruct --save-dir Meta-Llama-3-8B-Instruct-FP8
-```
+A stable release will be published.
+
+## Quickstart
+
+This package introduces the `AutoFP8ForCausalLM` and `BaseQuantizeConfig` objects for managing how your model will be compressed.
+
+Once you load your `AutoFP8ForCausalLM`, you can tokenize your data and provide it to the `model.quantize(tokenized_text)` function to calibrate+compress the model.
+
+Finally, you can save your quantized model in a compressed checkpoint format compatible with vLLM using `model.save_quantized("my_model_fp8")`.
+
+Here is a full example covering that flow:
+
+```python
+from transformers import AutoTokenizer
+from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

-Example model checkpoint with FP8 static scales for activations and weights: https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-FP8
+pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8"

-All arguments available for `quantize.py`:
+tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
+examples = ["auto_fp8 is an easy-to-use model quantization library"]
+examples = tokenizer(examples, return_tensors="pt").to("cuda")
+
+quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="dynamic")
+
+model = AutoFP8ForCausalLM.from_pretrained(
+    pretrained_model_dir, quantize_config=quantize_config
+)
+model.quantize(examples)
+model.save_quantized(quantized_model_dir)
 ```
-usage: quantize.py [-h] [--model-id MODEL_ID] [--save-dir SAVE_DIR] [--activation-scheme {static,dynamic}] [--num-samples NUM_SAMPLES] [--max-seq-len MAX_SEQ_LEN]
-
-options:
-  -h, --help            show this help message and exit
-  --model-id MODEL_ID
-  --save-dir SAVE_DIR
-  --activation-scheme {static,dynamic}
-  --num-samples NUM_SAMPLES
-  --max-seq-len MAX_SEQ_LEN
+
+Finally, load it into vLLM for inference! Support began in v0.4.2 (`pip install vllm>=0.4.2`). Note that hardware support for FP8 tensor cores must be available in the GPU you are using (Ada Lovelace, Hopper, and newer).
+
+```python
+from vllm import LLM
+
+model = LLM("Meta-Llama-3-8B-Instruct-FP8")
+# INFO 05-10 18:02:40 model_runner.py:175] Loading model weights took 8.4595 GB
+
+print(model.generate("Once upon a time"))
+# [RequestOutput(request_id=0, prompt='Once upon a time', prompt_token_ids=[128000, 12805, 5304, 264, 892], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' there was a man who fell in love with a woman. The man was so', token_ids=[1070, 574, 264, 893, 889, 11299, 304, 3021, 449, 264, 5333, 13, 578, 893, 574, 779], cumulative_logprob=-21.314169232733548, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1715378569.478381, last_token_time=1715378569.478381, first_scheduled_time=1715378569.480648, first_token_time=1715378569.7070432, time_in_queue=0.002267122268676758, finished_time=1715378570.104807), lora_request=None)]
 ```

 ## How to run FP8 quantized models
@@ -36,7 +61,7 @@ options:
 Then simply pass the quantized checkpoint directly to vLLM's entrypoints! It will detect the checkpoint format using the `quantization_config` in the `config.json`.
 ```python
 from vllm import LLM
-model = LLM("nm-testing/Meta-Llama-3-8B-Instruct-FP8")
+model = LLM("neuralmagic/Meta-Llama-3-8B-Instruct-FP8")
 # INFO 05-06 10:06:23 model_runner.py:172] Loading model weights took 8.4596 GB

 outputs = model.generate("Once upon a time,")
````
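The Quickstart example above uses `activation_scheme="dynamic"`; the config also accepts `"static"` (see `auto_fp8/config.py` below), in which case fixed activation scales are derived from the calibration data passed to `model.quantize()` rather than computed at runtime. Here is a minimal sketch of that variant, assuming the same `AutoFP8ForCausalLM` flow shown above; the calibration prompt and output directory name are placeholders.

```python
from transformers import AutoTokenizer
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"

# Placeholder calibration text; a real run would use a larger, representative sample.
examples = ["auto_fp8 is an easy-to-use model quantization library"]
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = tokenizer(examples, return_tensors="pt").to("cuda")

# "static" requests fixed activation scales derived from the calibration pass.
quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")

model = AutoFP8ForCausalLM.from_pretrained(
    pretrained_model_dir, quantize_config=quantize_config
)
model.quantize(examples)
model.save_quantized("Meta-Llama-3-8B-Instruct-FP8-static")
```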

auto_fp8/__init__.py

Lines changed: 7 additions & 0 deletions
```python
from .modeling import AutoFP8ForCausalLM
from .config import BaseQuantizeConfig

__all__ = [
    "AutoFP8ForCausalLM",
    "BaseQuantizeConfig",
]
```
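As a quick sanity check (a minimal sketch, assuming the package was installed with `pip install -e AutoFP8` as in the README), both names re-exported by `__all__` are importable straight from the package root:

```python
# Import the two public symbols exposed by auto_fp8/__init__.py.
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

print(AutoFP8ForCausalLM.__name__)  # AutoFP8ForCausalLM
print(BaseQuantizeConfig.__name__)  # BaseQuantizeConfig
```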

auto_fp8/config.py

Lines changed: 10 additions & 0 deletions
```python
class BaseQuantizeConfig:
    def __init__(self, quant_method="fp8", activation_scheme="static"):
        if quant_method != "fp8":
            raise ValueError("Only FP8 quantization is supported.")
        if activation_scheme not in ["static", "dynamic"]:
            raise ValueError(
                "Invalid activation_scheme. Choose either 'static' or 'dynamic'."
            )
        self.quant_method = quant_method
        self.activation_scheme = activation_scheme
```
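A short usage sketch of the constructor's validation, based only on the code shown above (the defaults and error message come straight from this file):

```python
from auto_fp8 import BaseQuantizeConfig

# Defaults: quant_method="fp8", activation_scheme="static"
cfg = BaseQuantizeConfig()
print(cfg.quant_method, cfg.activation_scheme)  # fp8 static

# "dynamic" is the other accepted activation scheme.
cfg = BaseQuantizeConfig(activation_scheme="dynamic")

# Anything else raises a ValueError.
try:
    BaseQuantizeConfig(activation_scheme="int8")
except ValueError as err:
    print(err)  # Invalid activation_scheme. Choose either 'static' or 'dynamic'.
```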
