enable regex quantization config saving for mixed bits #825
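The tests below exercise mixed-bits quantization: layer_config entries such as "k_proj": {"bits": 8} and "lm_head": {"bits": 16} override the default 4-bit setting for matching layers, and the PR's goal is to save those overrides in the quantization config as regex-style rules so they can be re-applied at load time. The following is a minimal sketch of that idea, not the PR's implementation; the "re:" rule prefix and the helper names are assumptions for illustration only.

import re


def layer_config_to_regex_rules(layer_config: dict) -> dict:
    """Collapse name-fragment overrides into regex-keyed rules (illustrative schema)."""
    return {f"re:.*{re.escape(name)}.*": overrides for name, overrides in layer_config.items()}


def match_overrides(module_name: str, regex_rules: dict) -> dict:
    """Return the first override whose pattern matches the full module name."""
    for pattern, overrides in regex_rules.items():
        if re.fullmatch(pattern[len("re:"):], module_name):
            return overrides
    return {}  # fall back to the global quantization settings


if __name__ == "__main__":
    # Mixed-bits overrides taken from the test diff below.
    layer_config = {"k_proj": {"bits": 8}, "lm_head": {"bits": 16}}
    rules = layer_config_to_regex_rules(layer_config)
    print(rules)  # {'re:.*k_proj.*': {'bits': 8}, 're:.*lm_head.*': {'bits': 16}}
    print(match_overrides("model.decoder.layers.0.self_attn.k_proj", rules))  # {'bits': 8}
    print(match_overrides("model.decoder.layers.0.self_attn.q_proj", rules))  # {} -> default bits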
@@ -8,10 +8,9 @@
 sys.path.insert(0, "../..")
 import torch
 from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer

-from auto_round import AutoRound
 from auto_round.testing_utils import require_gptqmodel
-
+from auto_round import AutoRound

 class LLMDataLoader:
     def __init__(self):
@@ -25,7 +24,7 @@ def __iter__(self):
 class TestAutoRound(unittest.TestCase):
     @classmethod
     def setUpClass(self):
-        model_name = "facebook/opt-125m"
+        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
         self.save_dir = "./saved"
         self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -35,11 +34,11 @@ def setUpClass(self):
     def tearDownClass(self):
         shutil.rmtree("./saved", ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)

     @require_gptqmodel
     def test_mixed_gptqmodel(self):
         bits, sym, group_size = 4, True, 128
-        model_name = "facebook/opt-125m"
+        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
         layer_config = {
             "k_proj": {"bits": 8},
             "lm_head": {"bits": 16},
@@ -58,17 +57,16 @@ def test_mixed_gptqmodel(self):
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
         from gptqmodel import GPTQModel

         model = GPTQModel.load(quantized_model_path)
-        assert model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8
-        assert model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4
-        result = model.generate("Uncovering deep insights begins with")[0] # tokens
-        assert "!!!" not in model.tokenizer.decode(result) # string output
+        assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8)
+        assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4)
+
+        result = model.generate("Uncovering deep insights begins with")[0] # tokens
+        assert("!!!" not in model.tokenizer.decode(result)) # string output
         shutil.rmtree(quantized_model_path, ignore_errors=True)

     def test_mixed_autoround_format(self):
         bits, sym, group_size = 4, True, 128
-        model_name = "facebook/opt-125m"
+        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
         layer_config = {
             "k_proj": {"bits": 8},
             "q_proj": {"bits": 3},
@@ -88,8 +86,8 @@ def test_mixed_autoround_format(self):
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
-        assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8
-        assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3
+        assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8)
+        assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         text = "There is a girl who likes adventure,"
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -115,7 +113,6 @@ def test_mixed_autoround_format_vllm(self):
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round")
-
         from vllm import LLM, SamplingParams

         # Sample prompts.
         prompts = [
             "The capital of France is",
@@ -124,7 +121,7 @@ def test_mixed_autoround_format_vllm(self):
         # Create a sampling params object.
         sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
         # Create an LLM.
-        QUANTIZATION = "auto-round" # quantized_model_path
+        QUANTIZATION = "auto-round" #quantized_model_path
         llm = LLM(model=quantized_model_path, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
         outputs = llm.generate(prompts, sampling_params)
         # Print the outputs.
@@ -136,16 +133,13 @@ def test_mixed_autoround_format_vllm(self):
             print(f"{prompt}: {generated_text}")
         shutil.rmtree(quantized_model_path, ignore_errors=True)

     def test_mixed_llmcompressor_format_vllm(self):
-        model_name = "facebook/opt-125m"
+        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
         layer_config = {
             "self_attn": {"bits": 16, "act_bits": 16, "data_type": "float"},
             "lm_head": {"bits": 16, "act_bits": 16, "data_type": "float"},
-            "fc1": {
-                "bits": 16,
-                "act_bits": 16,
-                "data_type": "float",
-            },
+            "fc1": {"bits": 16, "act_bits": 16, "data_type": "float", },
         }
         autoround = AutoRound(
             model_name,
@@ -156,11 +150,8 @@ def test_mixed_llmcompressor_format_vllm(self):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        compressed, _ = autoround.quantize_and_save(
-            output_dir=quantized_model_path, inplace=False, format="llm_compressor"
-        )
+        compressed,_ = autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="llm_compressor")
         from vllm import LLM, SamplingParams
-
         # Sample prompts.
         prompts = [
             "The capital of France is",
@@ -169,7 +160,7 @@ def test_mixed_llmcompressor_format_vllm(self):
         # Create a sampling params object.
         sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
         # Create an LLM.
-        QUANTIZATION = "auto-round" # quantized_model_path
+        QUANTIZATION = "auto-round" #quantized_model_path
         llm = LLM(model=quantized_model_path, trust_remote_code=True, tensor_parallel_size=1)
         outputs = llm.generate(prompts, sampling_params)
         # Print the outputs.
@@ -181,5 +172,7 @@ def test_mixed_llmcompressor_format_vllm(self):
         shutil.rmtree(quantized_model_path, ignore_errors=True)


+
+
 if __name__ == "__main__":
     unittest.main()
Review comment: For the AutoRound format, please make sure inference is ready first, then support it on the exporting side.
Review comment: If it's 16 bits, we could convert it to not_convert_module (I forget the exact name).
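One possible reading of this suggestion, sketched below under stated assumptions: layers whose config keeps them at 16 bits would not get a quantization rule at all; instead their names would be recorded in the exported config's skip list (Transformers-style configs call this modules_to_not_convert, llm-compressor configs use an ignore list). The helper below is illustrative only, not the library's API or this PR's saving logic.

# Illustrative sketch only: route 16-bit (effectively unquantized) layers into a
# skip list instead of emitting quantization rules for them.  The target field
# name (modules_to_not_convert / ignore) depends on the export format and is an
# assumption here.
def split_skip_layers(layer_config, skip_bits=16):
    to_quantize, to_skip = {}, []
    for name, overrides in layer_config.items():
        if overrides.get("bits") == skip_bits:
            to_skip.append(name)           # candidate for modules_to_not_convert / ignore
        else:
            to_quantize[name] = overrides  # keep the mixed-bits rule, e.g. {"bits": 8}
    return to_quantize, to_skip


if __name__ == "__main__":
    # Overrides taken from test_mixed_llmcompressor_format_vllm above.
    layer_config = {
        "self_attn": {"bits": 16, "act_bits": 16, "data_type": "float"},
        "lm_head": {"bits": 16, "act_bits": 16, "data_type": "float"},
        "k_proj": {"bits": 8},
    }
    rules, skipped = split_skip_layers(layer_config)
    print(rules)    # {'k_proj': {'bits': 8}}
    print(skipped)  # ['self_attn', 'lm_head']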